feat: update marker api

This commit is contained in:
Hisma 2025-07-22 20:49:28 -04:00
parent 8da08ad73a
commit f31cc07a9d
6 changed files with 174 additions and 95 deletions

View file

@ -2018,10 +2018,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig(
os.environ.get("DATALAB_MARKER_API_KEY", ""),
)
DATALAB_MARKER_LANGS = PersistentConfig(
"DATALAB_MARKER_LANGS",
"rag.datalab_marker_langs",
os.environ.get("DATALAB_MARKER_LANGS", ""),
DATALAB_MARKER_API_BASE_URL = PersistentConfig(
"DATALAB_MARKER_API_BASE_URL",
"rag.datalab_marker_api_base_url",
os.environ.get("DATALAB_MARKER_API_BASE_URL", ""),
)
DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig(
"DATALAB_MARKER_ADDITIONAL_CONFIG",
"rag.datalab_marker_additional_config",
os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""),
)
DATALAB_MARKER_USE_LLM = PersistentConfig(

View file

@ -227,7 +227,8 @@ from open_webui.config import (
CHUNK_SIZE,
CONTENT_EXTRACTION_ENGINE,
DATALAB_MARKER_API_KEY,
DATALAB_MARKER_LANGS,
DATALAB_MARKER_API_BASE_URL,
DATALAB_MARKER_ADDITIONAL_CONFIG,
DATALAB_MARKER_SKIP_CACHE,
DATALAB_MARKER_FORCE_OCR,
DATALAB_MARKER_PAGINATE,
@ -767,7 +768,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY
app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS
app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL
app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG
app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE
app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR
app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE

View file

@ -15,7 +15,8 @@ class DatalabMarkerLoader:
self,
file_path: str,
api_key: str,
langs: Optional[str] = None,
api_base_url: str,
additional_config: Optional[str] = None,
use_llm: bool = False,
skip_cache: bool = False,
force_ocr: bool = False,
@ -26,7 +27,8 @@ class DatalabMarkerLoader:
):
self.file_path = file_path
self.api_key = api_key
self.langs = langs
self.api_base_url = api_base_url
self.additional_config = additional_config
self.use_llm = use_llm
self.skip_cache = skip_cache
self.force_ocr = force_ocr
@ -60,7 +62,7 @@ class DatalabMarkerLoader:
return mime_map.get(ext, "application/octet-stream")
def check_marker_request_status(self, request_id: str) -> dict:
url = f"https://www.datalab.to/api/v1/marker/{request_id}"
url = f"{self.api_base_url}/{request_id}"
headers = {"X-Api-Key": self.api_key}
try:
response = requests.get(url, headers=headers)
@ -81,13 +83,12 @@ class DatalabMarkerLoader:
)
def load(self) -> List[Document]:
url = "https://www.datalab.to/api/v1/marker"
url = self.api_base_url
filename = os.path.basename(self.file_path)
mime_type = self._get_mime_type(filename)
headers = {"X-Api-Key": self.api_key}
form_data = {
"langs": self.langs,
"use_llm": str(self.use_llm).lower(),
"skip_cache": str(self.skip_cache).lower(),
"force_ocr": str(self.force_ocr).lower(),
@ -97,6 +98,9 @@ class DatalabMarkerLoader:
"output_format": self.output_format,
}
if self.additional_config and self.additional_config.strip():
form_data["additional_config"] = self.additional_config
log.info(
f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}"
)
@ -133,74 +137,87 @@ class DatalabMarkerLoader:
check_url = result.get("request_check_url")
request_id = result.get("request_id")
if not check_url:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned."
)
for _ in range(300): # Up to 10 minutes
time.sleep(2)
try:
poll_response = requests.get(check_url, headers=headers)
poll_response.raise_for_status()
poll_result = poll_response.json()
except (requests.HTTPError, ValueError) as e:
raw_body = poll_response.text
log.error(f"Polling error: {e}, response body: {raw_body}")
raise HTTPException(
status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
)
status_val = poll_result.get("status")
success_val = poll_result.get("success")
if status_val == "complete":
summary = {
k: poll_result.get(k)
for k in (
"status",
"output_format",
"success",
"error",
"page_count",
"total_cost",
# Check if this is a direct response (self-hosted) or polling response (DataLab)
if check_url:
# DataLab polling pattern
for _ in range(300): # Up to 10 minutes
time.sleep(2)
try:
poll_response = requests.get(check_url, headers=headers)
poll_response.raise_for_status()
poll_result = poll_response.json()
except (requests.HTTPError, ValueError) as e:
raw_body = poll_response.text
log.error(f"Polling error: {e}, response body: {raw_body}")
raise HTTPException(
status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
)
}
log.info(
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
)
break
if status_val == "failed" or success_val is False:
log.error(
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
)
error_msg = (
poll_result.get("error")
or "Marker returned failure without error message"
status_val = poll_result.get("status")
success_val = poll_result.get("success")
if status_val == "complete":
summary = {
k: poll_result.get(k)
for k in (
"status",
"output_format",
"success",
"error",
"page_count",
"total_cost",
)
}
log.info(
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
)
break
if status_val == "failed" or success_val is False:
log.error(
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
)
error_msg = (
poll_result.get("error")
or "Marker returned failure without error message"
)
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Marker processing failed: {error_msg}",
)
else:
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out"
)
if not poll_result.get("success", False):
error_msg = poll_result.get("error") or "Unknown processing error"
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Marker processing failed: {error_msg}",
detail=f"Final processing failed: {error_msg}",
)
# DataLab format - content in format-specific fields
content_key = self.output_format.lower()
raw_content = poll_result.get(content_key)
final_result = poll_result
else:
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out"
)
# Self-hosted direct response - content in "output" field
if "output" in result:
log.info("Self-hosted Marker returned direct response without polling")
raw_content = result.get("output")
final_result = result
else:
available_fields = list(result.keys()) if isinstance(result, dict) else "non-dict response"
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response."
)
if not poll_result.get("success", False):
error_msg = poll_result.get("error") or "Unknown processing error"
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Final processing failed: {error_msg}",
)
content_key = self.output_format.lower()
raw_content = poll_result.get(content_key)
if content_key == "json":
if self.output_format.lower() == "json":
full_text = json.dumps(raw_content, indent=2)
elif content_key in {"markdown", "html"}:
elif self.output_format.lower() in {"markdown", "html"}:
full_text = str(raw_content).strip()
else:
raise HTTPException(
@ -211,14 +228,14 @@ class DatalabMarkerLoader:
if not full_text:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail="Datalab Marker returned empty content",
detail="Marker returned empty content",
)
marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output")
os.makedirs(marker_output_dir, exist_ok=True)
file_ext_map = {"markdown": "md", "json": "json", "html": "html"}
file_ext = file_ext_map.get(content_key, "txt")
file_ext = file_ext_map.get(self.output_format.lower(), "txt")
output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}"
output_path = os.path.join(marker_output_dir, output_filename)
@ -231,13 +248,13 @@ class DatalabMarkerLoader:
metadata = {
"source": filename,
"output_format": poll_result.get("output_format", self.output_format),
"page_count": poll_result.get("page_count", 0),
"output_format": final_result.get("output_format", self.output_format),
"page_count": final_result.get("page_count", 0),
"processed_with_llm": self.use_llm,
"request_id": request_id or "",
}
images = poll_result.get("images", {})
images = final_result.get("images", {})
if images:
metadata["image_count"] = len(images)
metadata["images"] = json.dumps(list(images.keys()))

View file

@ -281,10 +281,15 @@ class Loader:
"tiff",
]
):
api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "")
if not api_base_url or api_base_url.strip() == "":
api_base_url = "https://www.datalab.to/api/v1/marker"
loader = DatalabMarkerLoader(
file_path=file_path,
api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
langs=self.kwargs.get("DATALAB_MARKER_LANGS"),
api_base_url=api_base_url,
additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),

View file

@ -401,7 +401,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
@ -566,7 +567,8 @@ class ConfigForm(BaseModel):
CONTENT_EXTRACTION_ENGINE: Optional[str] = None
PDF_EXTRACT_IMAGES: Optional[bool] = None
DATALAB_MARKER_API_KEY: Optional[str] = None
DATALAB_MARKER_LANGS: Optional[str] = None
DATALAB_MARKER_API_BASE_URL: Optional[str] = None
DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
DATALAB_MARKER_PAGINATE: Optional[bool] = None
@ -683,10 +685,15 @@ async def update_rag_config(
if form_data.DATALAB_MARKER_API_KEY is not None
else request.app.state.config.DATALAB_MARKER_API_KEY
)
request.app.state.config.DATALAB_MARKER_LANGS = (
form_data.DATALAB_MARKER_LANGS
if form_data.DATALAB_MARKER_LANGS is not None
else request.app.state.config.DATALAB_MARKER_LANGS
request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
form_data.DATALAB_MARKER_API_BASE_URL
if form_data.DATALAB_MARKER_API_BASE_URL is not None
else request.app.state.config.DATALAB_MARKER_API_BASE_URL
)
request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
)
request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
form_data.DATALAB_MARKER_SKIP_CACHE
@ -1006,7 +1013,8 @@ async def update_rag_config(
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
@ -1406,7 +1414,8 @@ def process_file(
loader = Loader(
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS,
DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,

View file

@ -170,6 +170,19 @@
return;
}
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' &&
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG &&
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== ''
) {
try {
JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG);
} catch (e) {
toast.error($i18n.t('Invalid JSON format in Additional Config'));
return;
}
}
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
(RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' ||
@ -243,6 +256,11 @@
2
);
// Set default API Base URL if empty
if (!config.DATALAB_MARKER_API_BASE_URL) {
config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker';
}
RAGConfig = config;
});
</script>
@ -337,6 +355,19 @@
</div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
<div class="my-0.5 flex gap-2 pr-2">
<Tooltip
content={$i18n.t(
'API Base URL for Datalab Marker service. Defaults to: https://www.datalab.to/api/v1/marker'
)}
placement="top-start"
className="w-full"
>
<input
class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Datalab Marker API Base URL')}
bind:value={RAGConfig.DATALAB_MARKER_API_BASE_URL}
/>
</Tooltip>
<SensitiveInput
placeholder={$i18n.t('Enter Datalab Marker API Key')}
required={false}
@ -344,24 +375,33 @@
/>
</div>
<div class="flex justify-between w-full mt-2">
<div class="text-xs font-medium">
{$i18n.t('Languages')}
<div class="flex flex-col gap-2 mt-2">
<div class=" flex flex-col w-full justify-between">
<div class=" mb-1 text-xs font-medium">
{$i18n.t('Additional Config')}
</div>
<div class="flex w-full items-center relative">
<Tooltip
content={$i18n.t(
'Additional configuration options for marker. This should be a JSON string with key-value pairs. For example, \'{"key": "value"}\'. Supported keys include: disable_links, keep_pageheader_in_output, keep_pagefooter_in_output, filter_blank_pages, drop_repeated_text, layout_coverage_threshold, merge_threshold, height_tolerance, gap_threshold, image_threshold, min_line_length, level_count, default_level'
)}
placement="top-start"
className="w-full"
>
<Textarea
bind:value={RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG}
placeholder={$i18n.t('Enter JSON config (e.g., {"disable_links": true})')}
/>
</Tooltip>
</div>
</div>
<input
class="text-sm bg-transparent outline-hidden"
type="text"
bind:value={RAGConfig.DATALAB_MARKER_LANGS}
placeholder={$i18n.t('e.g.) en,fr,de')}
/>
</div>
<div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium">
<Tooltip
content={$i18n.t(
'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.'
'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to False.'
)}
placement="top-start"
>