Merge pull request #15903 from Hisma/marker-api-update

feat: Add configurable API URL (for self-hosting) and additional_config parameter for Datalab Marker API
This commit is contained in:
Tim Jaeryang Baek 2025-08-04 15:21:03 +04:00 committed by GitHub
commit 5db60ca34f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 215 additions and 98 deletions

View file

@ -2032,10 +2032,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig(
os.environ.get("DATALAB_MARKER_API_KEY", ""), os.environ.get("DATALAB_MARKER_API_KEY", ""),
) )
DATALAB_MARKER_LANGS = PersistentConfig( DATALAB_MARKER_API_BASE_URL = PersistentConfig(
"DATALAB_MARKER_LANGS", "DATALAB_MARKER_API_BASE_URL",
"rag.datalab_marker_langs", "rag.datalab_marker_api_base_url",
os.environ.get("DATALAB_MARKER_LANGS", ""), os.environ.get("DATALAB_MARKER_API_BASE_URL", ""),
)
DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig(
"DATALAB_MARKER_ADDITIONAL_CONFIG",
"rag.datalab_marker_additional_config",
os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""),
) )
DATALAB_MARKER_USE_LLM = PersistentConfig( DATALAB_MARKER_USE_LLM = PersistentConfig(
@ -2075,6 +2081,12 @@ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig(
== "true", == "true",
) )
DATALAB_MARKER_FORMAT_LINES = PersistentConfig(
"DATALAB_MARKER_FORMAT_LINES",
"rag.datalab_marker_format_lines",
os.environ.get("DATALAB_MARKER_FORMAT_LINES", "false").lower() == "true",
)
DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig( DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
"DATALAB_MARKER_OUTPUT_FORMAT", "DATALAB_MARKER_OUTPUT_FORMAT",
"rag.datalab_marker_output_format", "rag.datalab_marker_output_format",

View file

@ -226,12 +226,14 @@ from open_webui.config import (
CHUNK_SIZE, CHUNK_SIZE,
CONTENT_EXTRACTION_ENGINE, CONTENT_EXTRACTION_ENGINE,
DATALAB_MARKER_API_KEY, DATALAB_MARKER_API_KEY,
DATALAB_MARKER_LANGS, DATALAB_MARKER_API_BASE_URL,
DATALAB_MARKER_ADDITIONAL_CONFIG,
DATALAB_MARKER_SKIP_CACHE, DATALAB_MARKER_SKIP_CACHE,
DATALAB_MARKER_FORCE_OCR, DATALAB_MARKER_FORCE_OCR,
DATALAB_MARKER_PAGINATE, DATALAB_MARKER_PAGINATE,
DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_STRIP_EXISTING_OCR,
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
DATALAB_MARKER_FORMAT_LINES,
DATALAB_MARKER_OUTPUT_FORMAT, DATALAB_MARKER_OUTPUT_FORMAT,
DATALAB_MARKER_USE_LLM, DATALAB_MARKER_USE_LLM,
EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_URL,
@ -771,7 +773,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY
app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL
app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG
app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE
app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR
app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE
@ -779,6 +782,7 @@ app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTI
app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = ( app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = (
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
) )
app.state.config.DATALAB_MARKER_FORMAT_LINES = DATALAB_MARKER_FORMAT_LINES
app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM
app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT
app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL

View file

@ -15,24 +15,28 @@ class DatalabMarkerLoader:
self, self,
file_path: str, file_path: str,
api_key: str, api_key: str,
langs: Optional[str] = None, api_base_url: str,
additional_config: Optional[str] = None,
use_llm: bool = False, use_llm: bool = False,
skip_cache: bool = False, skip_cache: bool = False,
force_ocr: bool = False, force_ocr: bool = False,
paginate: bool = False, paginate: bool = False,
strip_existing_ocr: bool = False, strip_existing_ocr: bool = False,
disable_image_extraction: bool = False, disable_image_extraction: bool = False,
format_lines: bool = False,
output_format: str = None, output_format: str = None,
): ):
self.file_path = file_path self.file_path = file_path
self.api_key = api_key self.api_key = api_key
self.langs = langs self.api_base_url = api_base_url
self.additional_config = additional_config
self.use_llm = use_llm self.use_llm = use_llm
self.skip_cache = skip_cache self.skip_cache = skip_cache
self.force_ocr = force_ocr self.force_ocr = force_ocr
self.paginate = paginate self.paginate = paginate
self.strip_existing_ocr = strip_existing_ocr self.strip_existing_ocr = strip_existing_ocr
self.disable_image_extraction = disable_image_extraction self.disable_image_extraction = disable_image_extraction
self.format_lines = format_lines
self.output_format = output_format self.output_format = output_format
def _get_mime_type(self, filename: str) -> str: def _get_mime_type(self, filename: str) -> str:
@ -60,7 +64,7 @@ class DatalabMarkerLoader:
return mime_map.get(ext, "application/octet-stream") return mime_map.get(ext, "application/octet-stream")
def check_marker_request_status(self, request_id: str) -> dict: def check_marker_request_status(self, request_id: str) -> dict:
url = f"https://www.datalab.to/api/v1/marker/{request_id}" url = f"{self.api_base_url}/{request_id}"
headers = {"X-Api-Key": self.api_key} headers = {"X-Api-Key": self.api_key}
try: try:
response = requests.get(url, headers=headers) response = requests.get(url, headers=headers)
@ -81,22 +85,25 @@ class DatalabMarkerLoader:
) )
def load(self) -> List[Document]: def load(self) -> List[Document]:
url = "https://www.datalab.to/api/v1/marker" url = self.api_base_url
filename = os.path.basename(self.file_path) filename = os.path.basename(self.file_path)
mime_type = self._get_mime_type(filename) mime_type = self._get_mime_type(filename)
headers = {"X-Api-Key": self.api_key} headers = {"X-Api-Key": self.api_key}
form_data = { form_data = {
"langs": self.langs,
"use_llm": str(self.use_llm).lower(), "use_llm": str(self.use_llm).lower(),
"skip_cache": str(self.skip_cache).lower(), "skip_cache": str(self.skip_cache).lower(),
"force_ocr": str(self.force_ocr).lower(), "force_ocr": str(self.force_ocr).lower(),
"paginate": str(self.paginate).lower(), "paginate": str(self.paginate).lower(),
"strip_existing_ocr": str(self.strip_existing_ocr).lower(), "strip_existing_ocr": str(self.strip_existing_ocr).lower(),
"disable_image_extraction": str(self.disable_image_extraction).lower(), "disable_image_extraction": str(self.disable_image_extraction).lower(),
"format_lines": str(self.format_lines).lower(),
"output_format": self.output_format, "output_format": self.output_format,
} }
if self.additional_config and self.additional_config.strip():
form_data["additional_config"] = self.additional_config
log.info( log.info(
f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}" f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}"
) )
@ -133,74 +140,92 @@ class DatalabMarkerLoader:
check_url = result.get("request_check_url") check_url = result.get("request_check_url")
request_id = result.get("request_id") request_id = result.get("request_id")
if not check_url:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned."
)
for _ in range(300): # Up to 10 minutes # Check if this is a direct response (self-hosted) or polling response (DataLab)
time.sleep(2) if check_url:
try: # DataLab polling pattern
poll_response = requests.get(check_url, headers=headers) for _ in range(300): # Up to 10 minutes
poll_response.raise_for_status() time.sleep(2)
poll_result = poll_response.json() try:
except (requests.HTTPError, ValueError) as e: poll_response = requests.get(check_url, headers=headers)
raw_body = poll_response.text poll_response.raise_for_status()
log.error(f"Polling error: {e}, response body: {raw_body}") poll_result = poll_response.json()
raise HTTPException( except (requests.HTTPError, ValueError) as e:
status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}" raw_body = poll_response.text
) log.error(f"Polling error: {e}, response body: {raw_body}")
raise HTTPException(
status_val = poll_result.get("status") status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
success_val = poll_result.get("success")
if status_val == "complete":
summary = {
k: poll_result.get(k)
for k in (
"status",
"output_format",
"success",
"error",
"page_count",
"total_cost",
) )
}
log.info(
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
)
break
if status_val == "failed" or success_val is False: status_val = poll_result.get("status")
log.error( success_val = poll_result.get("success")
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
) if status_val == "complete":
error_msg = ( summary = {
poll_result.get("error") k: poll_result.get(k)
or "Marker returned failure without error message" for k in (
"status",
"output_format",
"success",
"error",
"page_count",
"total_cost",
)
}
log.info(
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
)
break
if status_val == "failed" or success_val is False:
log.error(
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
)
error_msg = (
poll_result.get("error")
or "Marker returned failure without error message"
)
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Marker processing failed: {error_msg}",
)
else:
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT,
detail="Marker processing timed out",
) )
if not poll_result.get("success", False):
error_msg = poll_result.get("error") or "Unknown processing error"
raise HTTPException( raise HTTPException(
status.HTTP_400_BAD_REQUEST, status.HTTP_400_BAD_REQUEST,
detail=f"Marker processing failed: {error_msg}", detail=f"Final processing failed: {error_msg}",
) )
# DataLab format - content in format-specific fields
content_key = self.output_format.lower()
raw_content = poll_result.get(content_key)
final_result = poll_result
else: else:
raise HTTPException( # Self-hosted direct response - content in "output" field
status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out" if "output" in result:
) log.info("Self-hosted Marker returned direct response without polling")
raw_content = result.get("output")
final_result = result
else:
available_fields = (
list(result.keys())
if isinstance(result, dict)
else "non-dict response"
)
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response.",
)
if not poll_result.get("success", False): if self.output_format.lower() == "json":
error_msg = poll_result.get("error") or "Unknown processing error"
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Final processing failed: {error_msg}",
)
content_key = self.output_format.lower()
raw_content = poll_result.get(content_key)
if content_key == "json":
full_text = json.dumps(raw_content, indent=2) full_text = json.dumps(raw_content, indent=2)
elif content_key in {"markdown", "html"}: elif self.output_format.lower() in {"markdown", "html"}:
full_text = str(raw_content).strip() full_text = str(raw_content).strip()
else: else:
raise HTTPException( raise HTTPException(
@ -211,14 +236,14 @@ class DatalabMarkerLoader:
if not full_text: if not full_text:
raise HTTPException( raise HTTPException(
status.HTTP_400_BAD_REQUEST, status.HTTP_400_BAD_REQUEST,
detail="Datalab Marker returned empty content", detail="Marker returned empty content",
) )
marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output") marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output")
os.makedirs(marker_output_dir, exist_ok=True) os.makedirs(marker_output_dir, exist_ok=True)
file_ext_map = {"markdown": "md", "json": "json", "html": "html"} file_ext_map = {"markdown": "md", "json": "json", "html": "html"}
file_ext = file_ext_map.get(content_key, "txt") file_ext = file_ext_map.get(self.output_format.lower(), "txt")
output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}" output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}"
output_path = os.path.join(marker_output_dir, output_filename) output_path = os.path.join(marker_output_dir, output_filename)
@ -231,13 +256,13 @@ class DatalabMarkerLoader:
metadata = { metadata = {
"source": filename, "source": filename,
"output_format": poll_result.get("output_format", self.output_format), "output_format": final_result.get("output_format", self.output_format),
"page_count": poll_result.get("page_count", 0), "page_count": final_result.get("page_count", 0),
"processed_with_llm": self.use_llm, "processed_with_llm": self.use_llm,
"request_id": request_id or "", "request_id": request_id or "",
} }
images = poll_result.get("images", {}) images = final_result.get("images", {})
if images: if images:
metadata["image_count"] = len(images) metadata["image_count"] = len(images)
metadata["images"] = json.dumps(list(images.keys())) metadata["images"] = json.dumps(list(images.keys()))

View file

@ -281,10 +281,15 @@ class Loader:
"tiff", "tiff",
] ]
): ):
api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "")
if not api_base_url or api_base_url.strip() == "":
api_base_url = "https://www.datalab.to/api/v1/marker"
loader = DatalabMarkerLoader( loader = DatalabMarkerLoader(
file_path=file_path, file_path=file_path,
api_key=self.kwargs["DATALAB_MARKER_API_KEY"], api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
langs=self.kwargs.get("DATALAB_MARKER_LANGS"), api_base_url=api_base_url,
additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False), use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False), skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False), force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),
@ -295,6 +300,7 @@ class Loader:
disable_image_extraction=self.kwargs.get( disable_image_extraction=self.kwargs.get(
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
), ),
format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False),
output_format=self.kwargs.get( output_format=self.kwargs.get(
"DATALAB_MARKER_OUTPUT_FORMAT", "markdown" "DATALAB_MARKER_OUTPUT_FORMAT", "markdown"
), ),

View file

@ -401,12 +401,14 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
"DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
"DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
"DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM, "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM,
"DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
"EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
@ -566,12 +568,14 @@ class ConfigForm(BaseModel):
CONTENT_EXTRACTION_ENGINE: Optional[str] = None CONTENT_EXTRACTION_ENGINE: Optional[str] = None
PDF_EXTRACT_IMAGES: Optional[bool] = None PDF_EXTRACT_IMAGES: Optional[bool] = None
DATALAB_MARKER_API_KEY: Optional[str] = None DATALAB_MARKER_API_KEY: Optional[str] = None
DATALAB_MARKER_LANGS: Optional[str] = None DATALAB_MARKER_API_BASE_URL: Optional[str] = None
DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
DATALAB_MARKER_FORCE_OCR: Optional[bool] = None DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
DATALAB_MARKER_PAGINATE: Optional[bool] = None DATALAB_MARKER_PAGINATE: Optional[bool] = None
DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
DATALAB_MARKER_USE_LLM: Optional[bool] = None DATALAB_MARKER_USE_LLM: Optional[bool] = None
DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None
EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
@ -683,10 +687,15 @@ async def update_rag_config(
if form_data.DATALAB_MARKER_API_KEY is not None if form_data.DATALAB_MARKER_API_KEY is not None
else request.app.state.config.DATALAB_MARKER_API_KEY else request.app.state.config.DATALAB_MARKER_API_KEY
) )
request.app.state.config.DATALAB_MARKER_LANGS = ( request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
form_data.DATALAB_MARKER_LANGS form_data.DATALAB_MARKER_API_BASE_URL
if form_data.DATALAB_MARKER_LANGS is not None if form_data.DATALAB_MARKER_API_BASE_URL is not None
else request.app.state.config.DATALAB_MARKER_LANGS else request.app.state.config.DATALAB_MARKER_API_BASE_URL
)
request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
) )
request.app.state.config.DATALAB_MARKER_SKIP_CACHE = ( request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
form_data.DATALAB_MARKER_SKIP_CACHE form_data.DATALAB_MARKER_SKIP_CACHE
@ -713,6 +722,11 @@ async def update_rag_config(
if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None
else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
) )
request.app.state.config.DATALAB_MARKER_FORMAT_LINES = (
form_data.DATALAB_MARKER_FORMAT_LINES
if form_data.DATALAB_MARKER_FORMAT_LINES is not None
else request.app.state.config.DATALAB_MARKER_FORMAT_LINES
)
request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = ( request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = (
form_data.DATALAB_MARKER_OUTPUT_FORMAT form_data.DATALAB_MARKER_OUTPUT_FORMAT
if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None
@ -1006,7 +1020,8 @@ async def update_rag_config(
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
@ -1393,12 +1408,14 @@ def process_file(
loader = Loader( loader = Loader(
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY, DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS, DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE, DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR, DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE, DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM, DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,

View file

@ -170,6 +170,19 @@
return; return;
} }
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' &&
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG &&
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== ''
) {
try {
JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG);
} catch (e) {
toast.error($i18n.t('Invalid JSON format in Additional Config'));
return;
}
}
if ( if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' && RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
(RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' || (RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' ||
@ -195,10 +208,6 @@
ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',') ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',')
.map((ext) => ext.trim()) .map((ext) => ext.trim())
.filter((ext) => ext !== ''), .filter((ext) => ext !== ''),
DATALAB_MARKER_LANGS: RAGConfig.DATALAB_MARKER_LANGS.split(',')
.map((code) => code.trim())
.filter((code) => code !== '')
.join(', '),
DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse( DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse(
RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}' RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}'
), ),
@ -243,6 +252,11 @@
2 2
); );
// Set default API Base URL if empty
if (!config.DATALAB_MARKER_API_BASE_URL) {
config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker';
}
RAGConfig = config; RAGConfig = config;
}); });
</script> </script>
@ -336,6 +350,21 @@
</div> </div>
</div> </div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'} {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
<div class="my-0.5 flex gap-2 pr-2">
<Tooltip
content={$i18n.t(
'API Base URL for Datalab Marker service. Defaults to: https://www.datalab.to/api/v1/marker'
)}
placement="top-start"
className="w-full"
>
<input
class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Datalab Marker API Base URL')}
bind:value={RAGConfig.DATALAB_MARKER_API_BASE_URL}
/>
</Tooltip>
</div>
<div class="my-0.5 flex gap-2 pr-2"> <div class="my-0.5 flex gap-2 pr-2">
<SensitiveInput <SensitiveInput
placeholder={$i18n.t('Enter Datalab Marker API Key')} placeholder={$i18n.t('Enter Datalab Marker API Key')}
@ -344,24 +373,33 @@
/> />
</div> </div>
<div class="flex justify-between w-full mt-2"> <div class="flex flex-col gap-2 mt-2">
<div class="text-xs font-medium"> <div class=" flex flex-col w-full justify-between">
{$i18n.t('Languages')} <div class=" mb-1 text-xs font-medium">
{$i18n.t('Additional Config')}
</div>
<div class="flex w-full items-center relative">
<Tooltip
content={$i18n.t(
'Additional configuration options for marker. This should be a JSON string with key-value pairs. For example, \'{"key": "value"}\'. Supported keys include: disable_links, keep_pageheader_in_output, keep_pagefooter_in_output, filter_blank_pages, drop_repeated_text, layout_coverage_threshold, merge_threshold, height_tolerance, gap_threshold, image_threshold, min_line_length, level_count, default_level'
)}
placement="top-start"
className="w-full"
>
<Textarea
bind:value={RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG}
placeholder={$i18n.t('Enter JSON config (e.g., {"disable_links": true})')}
/>
</Tooltip>
</div>
</div> </div>
<input
class="text-sm bg-transparent outline-hidden"
type="text"
bind:value={RAGConfig.DATALAB_MARKER_LANGS}
placeholder={$i18n.t('e.g.) en,fr,de')}
/>
</div> </div>
<div class="flex justify-between w-full mt-2"> <div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium"> <div class="self-center text-xs font-medium">
<Tooltip <Tooltip
content={$i18n.t( content={$i18n.t(
'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.' 'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to False.'
)} )}
placement="top-start" placement="top-start"
> >
@ -445,6 +483,21 @@
<Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} /> <Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} />
</div> </div>
</div> </div>
<div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium">
<Tooltip
content={$i18n.t(
'Format the lines in the output. Defaults to False. If set to True, the lines will be formatted to detect inline math and styles.'
)}
placement="top-start"
>
{$i18n.t('Format Lines')}
</Tooltip>
</div>
<div class="flex items-center">
<Switch bind:state={RAGConfig.DATALAB_MARKER_FORMAT_LINES} />
</div>
</div>
<div class="flex justify-between w-full mt-2"> <div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium"> <div class="self-center text-xs font-medium">
<Tooltip <Tooltip