feat: add MinerU document parser support for both local and managed (cloud) APIs

This commit is contained in:
palazski 2025-10-13 21:09:52 +03:00
parent 46ae3f4f5d
commit 40e9d9c330
6 changed files with 866 additions and 0 deletions

View file

@ -2291,6 +2291,61 @@ DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"), os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"),
) )
# MinerU Configuration
# Every setting below takes its initial value from the environment variable
# of the same name and is paired with a "rag.mineru_*" config path.
# Boolean flags are enabled only when the variable equals "true"
# (case-insensitive).
MINERU_API_MODE = PersistentConfig(
    "MINERU_API_MODE",
    "rag.mineru_api_mode",
    os.getenv("MINERU_API_MODE", "local"),  # "local" or "cloud"
)

MINERU_API_URL = PersistentConfig(
    "MINERU_API_URL",
    "rag.mineru_api_url",
    os.getenv("MINERU_API_URL", "http://localhost:8000"),
)

MINERU_API_KEY = PersistentConfig(
    "MINERU_API_KEY",
    "rag.mineru_api_key",
    os.getenv("MINERU_API_KEY", ""),
)

MINERU_ENABLE_OCR = PersistentConfig(
    "MINERU_ENABLE_OCR",
    "rag.mineru_enable_ocr",
    os.getenv("MINERU_ENABLE_OCR", "false").lower() == "true",
)

MINERU_ENABLE_FORMULA = PersistentConfig(
    "MINERU_ENABLE_FORMULA",
    "rag.mineru_enable_formula",
    os.getenv("MINERU_ENABLE_FORMULA", "true").lower() == "true",
)

MINERU_ENABLE_TABLE = PersistentConfig(
    "MINERU_ENABLE_TABLE",
    "rag.mineru_enable_table",
    os.getenv("MINERU_ENABLE_TABLE", "true").lower() == "true",
)

MINERU_LANGUAGE = PersistentConfig(
    "MINERU_LANGUAGE",
    "rag.mineru_language",
    os.getenv("MINERU_LANGUAGE", "en"),
)

MINERU_MODEL_VERSION = PersistentConfig(
    "MINERU_MODEL_VERSION",
    "rag.mineru_model_version",
    os.getenv("MINERU_MODEL_VERSION", "pipeline"),  # "pipeline" or "vlm"
)

MINERU_PAGE_RANGES = PersistentConfig(
    "MINERU_PAGE_RANGES",
    "rag.mineru_page_ranges",
    os.getenv("MINERU_PAGE_RANGES", ""),
)
EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig( EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig(
"EXTERNAL_DOCUMENT_LOADER_URL", "EXTERNAL_DOCUMENT_LOADER_URL",
"rag.external_document_loader_url", "rag.external_document_loader_url",

View file

@ -243,6 +243,15 @@ from open_webui.config import (
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
DATALAB_MARKER_FORMAT_LINES, DATALAB_MARKER_FORMAT_LINES,
DATALAB_MARKER_OUTPUT_FORMAT, DATALAB_MARKER_OUTPUT_FORMAT,
MINERU_API_MODE,
MINERU_API_URL,
MINERU_API_KEY,
MINERU_ENABLE_OCR,
MINERU_ENABLE_FORMULA,
MINERU_ENABLE_TABLE,
MINERU_LANGUAGE,
MINERU_MODEL_VERSION,
MINERU_PAGE_RANGES,
DATALAB_MARKER_USE_LLM, DATALAB_MARKER_USE_LLM,
EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_URL,
EXTERNAL_DOCUMENT_LOADER_API_KEY, EXTERNAL_DOCUMENT_LOADER_API_KEY,
@ -853,6 +862,15 @@ app.state.config.DOCLING_PICTURE_DESCRIPTION_API = DOCLING_PICTURE_DESCRIPTION_A
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
# MinerU document parser settings: expose each imported MINERU_* config value
# on app.state.config under the same name.
app.state.config.MINERU_API_MODE = MINERU_API_MODE
app.state.config.MINERU_API_URL = MINERU_API_URL
app.state.config.MINERU_API_KEY = MINERU_API_KEY
app.state.config.MINERU_ENABLE_OCR = MINERU_ENABLE_OCR
app.state.config.MINERU_ENABLE_FORMULA = MINERU_ENABLE_FORMULA
app.state.config.MINERU_ENABLE_TABLE = MINERU_ENABLE_TABLE
app.state.config.MINERU_LANGUAGE = MINERU_LANGUAGE
app.state.config.MINERU_MODEL_VERSION = MINERU_MODEL_VERSION
app.state.config.MINERU_PAGE_RANGES = MINERU_PAGE_RANGES
app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME

View file

@ -27,6 +27,7 @@ from open_webui.retrieval.loaders.external_document import ExternalDocumentLoade
from open_webui.retrieval.loaders.mistral import MistralLoader from open_webui.retrieval.loaders.mistral import MistralLoader
from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader
from open_webui.retrieval.loaders.mineru import MinerULoader
from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL
@ -367,6 +368,27 @@ class Loader:
api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"), api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
azure_credential=DefaultAzureCredential(), azure_credential=DefaultAzureCredential(),
) )
elif self.engine == "mineru" and file_ext in [
"pdf",
"doc",
"docx",
"ppt",
"pptx",
"xls",
"xlsx",
]:
loader = MinerULoader(
file_path=file_path,
api_mode=self.kwargs.get("MINERU_API_MODE", "local"),
api_url=self.kwargs.get("MINERU_API_URL", "http://localhost:8000"),
api_key=self.kwargs.get("MINERU_API_KEY", ""),
enable_ocr=self.kwargs.get("MINERU_ENABLE_OCR", False),
enable_formula=self.kwargs.get("MINERU_ENABLE_FORMULA", True),
enable_table=self.kwargs.get("MINERU_ENABLE_TABLE", True),
language=self.kwargs.get("MINERU_LANGUAGE", "en"),
model_version=self.kwargs.get("MINERU_MODEL_VERSION", "pipeline"),
page_ranges=self.kwargs.get("MINERU_PAGE_RANGES", ""),
)
elif ( elif (
self.engine == "mistral_ocr" self.engine == "mistral_ocr"
and self.kwargs.get("MISTRAL_OCR_API_KEY") != "" and self.kwargs.get("MISTRAL_OCR_API_KEY") != ""

View file

@ -0,0 +1,543 @@
import os
import time
import requests
import logging
import tempfile
import zipfile
from typing import List, Optional
from langchain_core.documents import Document
from fastapi import HTTPException, status
log = logging.getLogger(__name__)
class MinerULoader:
    """
    Document loader that converts a file to markdown through MinerU.

    Two API modes are supported:

    * ``local``: a self-hosted MinerU API. The file is posted to
      ``/file_parse`` and the markdown is read from that response.
    * ``cloud``: the MinerU managed service. An upload URL is requested, the
      file is uploaded to it, the batch is polled until it finishes, and the
      markdown is read from the downloaded result ZIP.

    Invalid constructor arguments raise ``ValueError``. Failures while
    talking to MinerU are raised as ``HTTPException``.
    """

    # Fallbacks used when an empty API URL is passed to the constructor.
    DEFAULT_LOCAL_API_URL = "http://localhost:8000"
    DEFAULT_CLOUD_API_URL = "https://mineru.net/api/v4"

    def __init__(
        self,
        file_path: str,
        api_mode: str = "local",
        api_url: str = "http://localhost:8000",
        api_key: str = "",
        enable_ocr: bool = False,
        enable_formula: bool = True,
        enable_table: bool = True,
        language: str = "en",
        model_version: str = "pipeline",
        page_ranges: str = "",
    ):
        """
        Store the parser settings and validate the API mode.

        Args:
            file_path: Path of the document to parse.
            api_mode: ``"local"`` or ``"cloud"`` (case-insensitive).
            api_url: Base URL of the MinerU API; a trailing slash is removed.
                An empty value falls back to the default URL for the mode.
            api_key: Bearer token; required in cloud mode.
            enable_ocr: Request OCR parsing.
            enable_formula: Request formula recognition.
            enable_table: Request table recognition.
            language: Document language code sent to MinerU.
            model_version: ``"pipeline"`` or ``"vlm"``.
            page_ranges: Page range expression; only sent in cloud mode.

        Raises:
            ValueError: If the mode is unknown, or cloud mode has no API key.
        """
        self.file_path = file_path
        self.api_mode = (api_mode or "local").strip().lower()

        # Validate API mode
        if self.api_mode not in ("local", "cloud"):
            raise ValueError(
                f"Invalid API mode: {self.api_mode}. Must be 'local' or 'cloud'"
            )

        # An empty URL would produce an invalid request URL later on, so
        # fall back to the default that matches the selected mode.
        default_url = (
            self.DEFAULT_CLOUD_API_URL
            if self.api_mode == "cloud"
            else self.DEFAULT_LOCAL_API_URL
        )
        self.api_url = (api_url or default_url).rstrip("/")
        self.api_key = api_key
        self.enable_ocr = enable_ocr
        self.enable_formula = enable_formula
        self.enable_table = enable_table
        self.language = language
        self.model_version = model_version
        self.page_ranges = page_ranges

        # Validate Cloud API requirements
        if self.api_mode == "cloud" and not self.api_key:
            raise ValueError("API key is required for Cloud API mode")

    def load(self) -> List["Document"]:
        """
        Parse the document and return it as a single-element list.

        Routes to the Cloud or Local API based on ``api_mode``. Any error is
        logged and re-raised unchanged.
        """
        try:
            if self.api_mode == "cloud":
                return self._load_cloud_api()
            return self._load_local_api()
        except Exception as e:
            log.error(f"Error loading document with MinerU: {e}")
            raise

    @staticmethod
    def _describe_http_error(
        prefix: str, error, message_key: Optional[str] = None
    ) -> str:
        """
        Build an error detail string for a failed HTTP request.

        The response body is appended as parsed JSON when possible (or only
        its ``message_key`` entry, when given and present), otherwise as raw
        text.
        """
        detail = f"{prefix}: {error}"
        response = getattr(error, "response", None)
        if response is None:
            return detail
        try:
            payload = response.json()
            if message_key is not None:
                payload = payload.get(message_key, payload)
        except (ValueError, AttributeError):
            # Body is not JSON, or the JSON is not an object: use raw text.
            return f"{detail} - {response.text}"
        return f"{detail} - {payload}"

    @staticmethod
    def _list_files(root: str) -> List[str]:
        """Return every file path under ``root`` in a deterministic order."""
        found = []
        for current, dirs, files in os.walk(root):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            found.extend(os.path.join(current, name) for name in sorted(files))
        return found

    def _build_local_form_data(self) -> dict:
        """Translate the loader settings into Local API form fields."""
        form_data = {
            "return_md": "true",
            "formula_enable": str(self.enable_formula).lower(),
            "table_enable": str(self.enable_table).lower(),
            # Parse method based on OCR setting
            "parse_method": "ocr" if self.enable_ocr else "auto",
            # Local API selects the model through the "backend" parameter
            "backend": (
                "vlm-vllm-engine" if self.model_version == "vlm" else "pipeline"
            ),
        }
        # Language configuration (Local API field is named lang_list)
        if self.language:
            form_data["lang_list"] = self.language
        return form_data

    def _load_local_api(self) -> List["Document"]:
        """
        Load the document through the Local API (synchronous).

        Posts the file to ``/file_parse`` and reads the markdown of the first
        entry in the response's ``results`` mapping.

        Raises:
            HTTPException: 404 if the file is missing, 504 on timeout, 400 on
                an HTTP error status or empty output, 502 on a malformed
                response, 500 on any other request failure.
        """
        log.info(f"Using MinerU Local API at {self.api_url}")
        filename = os.path.basename(self.file_path)
        form_data = self._build_local_form_data()

        # Page ranges are not translated for the Local API; only warn.
        if self.page_ranges:
            log.warning(
                f"Page ranges '{self.page_ranges}' specified but Local API uses different format. "
                "Consider using start_page_id/end_page_id parameters if needed."
            )

        try:
            with open(self.file_path, "rb") as f:
                files = {"files": (filename, f, "application/octet-stream")}
                log.info(f"Sending file to MinerU Local API: {filename}")
                log.debug(f"Local API parameters: {form_data}")
                response = requests.post(
                    f"{self.api_url}/file_parse",
                    data=form_data,
                    files=files,
                    timeout=300,  # 5 minute timeout for large documents
                )
            response.raise_for_status()
        except FileNotFoundError:
            raise HTTPException(
                status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}"
            )
        except requests.Timeout:
            raise HTTPException(
                status.HTTP_504_GATEWAY_TIMEOUT,
                detail="MinerU Local API request timed out",
            )
        except requests.HTTPError as e:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=self._describe_http_error(
                    "MinerU Local API request failed", e
                ),
            ) from e
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error calling MinerU Local API: {str(e)}",
            ) from e

        # Parse response
        try:
            result = response.json()
        except ValueError as e:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Invalid JSON response from MinerU Local API: {e}",
            ) from e

        # Extract markdown content from response
        if not isinstance(result, dict) or "results" not in result:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail="MinerU Local API response missing 'results' field",
            )
        results = result["results"]
        if not results:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail="MinerU returned empty results",
            )
        if not isinstance(results, dict):
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail="MinerU Local API returned an unexpected 'results' format",
            )

        # Get the first (and typically only) result
        file_result = next(iter(results.values()))
        markdown_content = (
            file_result.get("md_content", "") if isinstance(file_result, dict) else ""
        )
        if not markdown_content:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail="MinerU returned empty markdown content",
            )
        log.info(f"Successfully parsed document with MinerU Local API: {filename}")

        # Create metadata
        metadata = {
            "source": filename,
            "api_mode": "local",
            "backend": result.get("backend", "unknown"),
            "version": result.get("version", "unknown"),
        }
        return [Document(page_content=markdown_content, metadata=metadata)]

    def _load_cloud_api(self) -> List["Document"]:
        """
        Load the document through the Cloud API (asynchronous).

        Uses the batch upload endpoint so no public file URL is needed.

        Raises:
            HTTPException: Propagated from the individual steps; 502 if the
                finished result carries no ``full_zip_url``.
        """
        log.info(f"Using MinerU Cloud API at {self.api_url}")
        filename = os.path.basename(self.file_path)

        # Step 1: Request presigned upload URL
        batch_id, upload_url = self._request_upload_url(filename)
        # Step 2: Upload file to presigned URL
        self._upload_to_presigned_url(upload_url)
        # Step 3: Poll for results
        result = self._poll_batch_status(batch_id, filename)

        # Step 4: Download and extract markdown from ZIP
        zip_url = result.get("full_zip_url")
        if not zip_url:
            # Report a malformed upstream result instead of a raw KeyError.
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail="MinerU Cloud API result is missing 'full_zip_url'",
            )
        markdown_content = self._download_and_extract_zip(zip_url, filename)
        log.info(f"Successfully parsed document with MinerU Cloud API: {filename}")

        # Create metadata
        metadata = {
            "source": filename,
            "api_mode": "cloud",
            "batch_id": batch_id,
        }
        return [Document(page_content=markdown_content, metadata=metadata)]

    def _request_upload_url(self, filename: str) -> tuple:
        """
        Request a presigned upload URL from the Cloud API.

        Returns:
            ``(batch_id, upload_url)``.

        Raises:
            HTTPException: 400 on an HTTP or API-level error, 502 on a
                malformed response, 500 on any other request failure.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Build request body
        file_entry = {
            "name": filename,
            "is_ocr": self.enable_ocr,
        }
        # Add page ranges if specified
        if self.page_ranges:
            file_entry["page_ranges"] = self.page_ranges
        request_body = {
            "enable_formula": self.enable_formula,
            "enable_table": self.enable_table,
            "language": self.language,
            "model_version": self.model_version,
            "files": [file_entry],
        }

        log.info(f"Requesting upload URL for: {filename}")
        log.debug(f"Cloud API request body: {request_body}")
        try:
            response = requests.post(
                f"{self.api_url}/file-urls/batch",
                headers=headers,
                json=request_body,
                timeout=30,
            )
            response.raise_for_status()
        except requests.HTTPError as e:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=self._describe_http_error(
                    "Failed to request upload URL", e, message_key="msg"
                ),
            ) from e
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error requesting upload URL: {str(e)}",
            ) from e

        try:
            result = response.json()
        except ValueError as e:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Invalid JSON response: {e}",
            ) from e

        # Check for API error response
        if result.get("code") != 0:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}",
            )

        # "data" may be present but null; treat that like a missing field.
        data = result.get("data") or {}
        batch_id = data.get("batch_id")
        file_urls = data.get("file_urls") or []
        if not batch_id or not file_urls:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail="MinerU Cloud API response missing batch_id or file_urls",
            )
        upload_url = file_urls[0]
        log.info(f"Received upload URL for batch: {batch_id}")
        return batch_id, upload_url

    def _upload_to_presigned_url(self, upload_url: str) -> None:
        """
        Upload the file to the presigned URL (no authentication header sent).

        Raises:
            HTTPException: 404 if the file is missing, 504 on timeout, 400 on
                an HTTP error status, 500 on any other failure.
        """
        log.info("Uploading file to presigned URL")
        try:
            with open(self.file_path, "rb") as f:
                response = requests.put(
                    upload_url,
                    data=f,
                    timeout=300,  # 5 minute timeout for large files
                )
            response.raise_for_status()
        except FileNotFoundError:
            raise HTTPException(
                status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}"
            )
        except requests.Timeout:
            raise HTTPException(
                status.HTTP_504_GATEWAY_TIMEOUT,
                detail="File upload to presigned URL timed out",
            )
        except requests.HTTPError as e:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Failed to upload file to presigned URL: {e}",
            ) from e
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error uploading file: {str(e)}",
            ) from e
        log.info("File uploaded successfully")

    def _poll_batch_status(self, batch_id: str, filename: str) -> dict:
        """
        Poll the batch status until the file is processed.

        Returns:
            The result dict for ``filename`` once its state is ``"done"``.

        Raises:
            HTTPException: 400 on an HTTP/API error or a ``"failed"`` state,
                502 on a malformed response or when the file is not listed,
                504 when the iteration budget is exhausted, 500 on any other
                request failure.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
        }
        max_iterations = 300  # 10 minutes max (2 seconds per iteration)
        poll_interval = 2  # seconds
        in_progress_states = ("waiting-file", "pending", "running", "converting")

        log.info(f"Polling batch status: {batch_id}")
        for iteration in range(max_iterations):
            try:
                response = requests.get(
                    f"{self.api_url}/extract-results/batch/{batch_id}",
                    headers=headers,
                    timeout=30,
                )
                response.raise_for_status()
            except requests.HTTPError as e:
                raise HTTPException(
                    status.HTTP_400_BAD_REQUEST,
                    detail=self._describe_http_error(
                        "Failed to poll batch status", e, message_key="msg"
                    ),
                ) from e
            except Exception as e:
                raise HTTPException(
                    status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=f"Error polling batch status: {str(e)}",
                ) from e

            try:
                result = response.json()
            except ValueError as e:
                raise HTTPException(
                    status.HTTP_502_BAD_GATEWAY,
                    detail=f"Invalid JSON response while polling: {e}",
                ) from e

            # Check for API error response
            if result.get("code") != 0:
                raise HTTPException(
                    status.HTTP_400_BAD_REQUEST,
                    detail=f"MinerU Cloud API error: {result.get('msg', 'Unknown error')}",
                )

            # Null "data"/"extract_result" values are treated as empty.
            data = result.get("data") or {}
            extract_result = data.get("extract_result") or []

            # Find our file in the batch results
            file_result = next(
                (item for item in extract_result if item.get("file_name") == filename),
                None,
            )
            if not file_result:
                raise HTTPException(
                    status.HTTP_502_BAD_GATEWAY,
                    detail=f"File {filename} not found in batch results",
                )

            state = file_result.get("state")
            if state == "done":
                log.info(f"Processing complete for {filename}")
                return file_result
            if state == "failed":
                error_msg = file_result.get("err_msg", "Unknown error")
                raise HTTPException(
                    status.HTTP_400_BAD_REQUEST,
                    detail=f"MinerU processing failed: {error_msg}",
                )

            if state not in in_progress_states:
                log.warning(f"Unknown state: {state}")
            elif iteration % 10 == 0:  # Log every 20 seconds
                log.info(
                    f"Processing status: {state} (iteration {iteration + 1}/{max_iterations})"
                )
            time.sleep(poll_interval)

        # Timeout
        raise HTTPException(
            status.HTTP_504_GATEWAY_TIMEOUT,
            detail="MinerU processing timed out after 10 minutes",
        )

    def _download_and_extract_zip(self, zip_url: str, filename: str) -> str:
        """
        Download the result ZIP and return its markdown content.

        The archive is unpacked inside a temporary directory that is removed
        on every exit path. The first non-empty ``.md`` file, in sorted walk
        order, is returned.

        Args:
            zip_url: URL of the result archive.
            filename: Name of the parsed document (currently unused; kept for
                interface compatibility).

        Raises:
            HTTPException: 400 on a download HTTP error or empty markdown,
                502 on an invalid ZIP or when no markdown file can be read,
                500 on any other failure.
        """
        log.info(f"Downloading results from: {zip_url}")
        try:
            response = requests.get(zip_url, timeout=60)
            response.raise_for_status()
        except requests.HTTPError as e:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Failed to download results ZIP: {e}",
            ) from e
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error downloading results: {str(e)}",
            ) from e

        try:
            # Keep the ZIP inside the temporary directory so that it is
            # cleaned up together with the extracted files, even on failure.
            with tempfile.TemporaryDirectory() as tmp_dir:
                zip_path = os.path.join(tmp_dir, "result.zip")
                extract_dir = os.path.join(tmp_dir, "extracted")
                os.makedirs(extract_dir)
                with open(zip_path, "wb") as tmp_zip:
                    tmp_zip.write(response.content)
                with zipfile.ZipFile(zip_path, "r") as zip_ref:
                    zip_ref.extractall(extract_dir)

                all_files = self._list_files(extract_dir)
                md_files = [path for path in all_files if path.endswith(".md")]

                # Use the first non-empty markdown file
                markdown_content = None
                for md_path in md_files:
                    log.info(f"Found markdown file at: {md_path}")
                    try:
                        with open(md_path, "r", encoding="utf-8") as f:
                            markdown_content = f.read()
                    except (OSError, UnicodeDecodeError) as e:
                        log.warning(f"Failed to read {md_path}: {e}")
                        continue
                    if markdown_content:
                        break

                if markdown_content is None:
                    # Report archive-relative names rather than temp paths.
                    available = [os.path.relpath(p, extract_dir) for p in all_files]
                    log.error(f"Available files in ZIP: {available}")
                    unreadable = [name for name in available if name.endswith(".md")]
                    if unreadable:
                        error_msg = (
                            f"Found .md files but couldn't read them: {unreadable}"
                        )
                    else:
                        error_msg = (
                            f"No .md files found in ZIP. Available files: {available}"
                        )
                    raise HTTPException(
                        status.HTTP_502_BAD_GATEWAY,
                        detail=error_msg,
                    )
        except HTTPException:
            # Keep the specific status chosen above instead of letting the
            # generic handler below turn it into a 500.
            raise
        except zipfile.BadZipFile as e:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Invalid ZIP file received: {e}",
            ) from e
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error extracting ZIP: {str(e)}",
            ) from e

        if not markdown_content:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail="Extracted markdown content is empty",
            )
        log.info(
            f"Successfully extracted markdown content ({len(markdown_content)} characters)"
        )
        return markdown_content

View file

@ -466,6 +466,16 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
# MinerU settings
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
"MINERU_API_URL": request.app.state.config.MINERU_API_URL,
"MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
"MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR,
"MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA,
"MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE,
"MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE,
"MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION,
"MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES,
# Reranking settings # Reranking settings
"RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL, "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
"RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE, "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
@ -647,6 +657,17 @@ class ConfigForm(BaseModel):
DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
MISTRAL_OCR_API_KEY: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None
# MinerU settings
MINERU_API_MODE: Optional[str] = None
MINERU_API_URL: Optional[str] = None
MINERU_API_KEY: Optional[str] = None
MINERU_ENABLE_OCR: Optional[bool] = None
MINERU_ENABLE_FORMULA: Optional[bool] = None
MINERU_ENABLE_TABLE: Optional[bool] = None
MINERU_LANGUAGE: Optional[str] = None
MINERU_MODEL_VERSION: Optional[str] = None
MINERU_PAGE_RANGES: Optional[str] = None
# Reranking settings # Reranking settings
RAG_RERANKING_MODEL: Optional[str] = None RAG_RERANKING_MODEL: Optional[str] = None
RAG_RERANKING_ENGINE: Optional[str] = None RAG_RERANKING_ENGINE: Optional[str] = None
@ -886,6 +907,53 @@ async def update_rag_config(
else request.app.state.config.MISTRAL_OCR_API_KEY else request.app.state.config.MISTRAL_OCR_API_KEY
) )
# MinerU settings
request.app.state.config.MINERU_API_MODE = (
form_data.MINERU_API_MODE
if form_data.MINERU_API_MODE is not None
else request.app.state.config.MINERU_API_MODE
)
request.app.state.config.MINERU_API_URL = (
form_data.MINERU_API_URL
if form_data.MINERU_API_URL is not None
else request.app.state.config.MINERU_API_URL
)
request.app.state.config.MINERU_API_KEY = (
form_data.MINERU_API_KEY
if form_data.MINERU_API_KEY is not None
else request.app.state.config.MINERU_API_KEY
)
request.app.state.config.MINERU_ENABLE_OCR = (
form_data.MINERU_ENABLE_OCR
if form_data.MINERU_ENABLE_OCR is not None
else request.app.state.config.MINERU_ENABLE_OCR
)
request.app.state.config.MINERU_ENABLE_FORMULA = (
form_data.MINERU_ENABLE_FORMULA
if form_data.MINERU_ENABLE_FORMULA is not None
else request.app.state.config.MINERU_ENABLE_FORMULA
)
request.app.state.config.MINERU_ENABLE_TABLE = (
form_data.MINERU_ENABLE_TABLE
if form_data.MINERU_ENABLE_TABLE is not None
else request.app.state.config.MINERU_ENABLE_TABLE
)
request.app.state.config.MINERU_LANGUAGE = (
form_data.MINERU_LANGUAGE
if form_data.MINERU_LANGUAGE is not None
else request.app.state.config.MINERU_LANGUAGE
)
request.app.state.config.MINERU_MODEL_VERSION = (
form_data.MINERU_MODEL_VERSION
if form_data.MINERU_MODEL_VERSION is not None
else request.app.state.config.MINERU_MODEL_VERSION
)
request.app.state.config.MINERU_PAGE_RANGES = (
form_data.MINERU_PAGE_RANGES
if form_data.MINERU_PAGE_RANGES is not None
else request.app.state.config.MINERU_PAGE_RANGES
)
# Reranking settings # Reranking settings
if request.app.state.config.RAG_RERANKING_ENGINE == "": if request.app.state.config.RAG_RERANKING_ENGINE == "":
# Unloading the internal reranker and clear VRAM memory # Unloading the internal reranker and clear VRAM memory
@ -1150,6 +1218,16 @@ async def update_rag_config(
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
# MinerU settings
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
"MINERU_API_URL": request.app.state.config.MINERU_API_URL,
"MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
"MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR,
"MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA,
"MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE,
"MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE,
"MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION,
"MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES,
# Reranking settings # Reranking settings
"RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL, "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
"RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE, "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
@ -1560,6 +1638,15 @@ def process_file(
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY, MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY,
MINERU_API_MODE=request.app.state.config.MINERU_API_MODE,
MINERU_API_URL=request.app.state.config.MINERU_API_URL,
MINERU_API_KEY=request.app.state.config.MINERU_API_KEY,
MINERU_ENABLE_OCR=request.app.state.config.MINERU_ENABLE_OCR,
MINERU_ENABLE_FORMULA=request.app.state.config.MINERU_ENABLE_FORMULA,
MINERU_ENABLE_TABLE=request.app.state.config.MINERU_ENABLE_TABLE,
MINERU_LANGUAGE=request.app.state.config.MINERU_LANGUAGE,
MINERU_MODEL_VERSION=request.app.state.config.MINERU_MODEL_VERSION,
MINERU_PAGE_RANGES=request.app.state.config.MINERU_PAGE_RANGES,
) )
docs = loader.load( docs = loader.load(
file.filename, file.meta.get("content_type"), file_path file.filename, file.meta.get("content_type"), file_path

View file

@ -207,6 +207,15 @@
return; return;
} }
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru' &&
RAGConfig.MINERU_API_MODE === 'cloud' &&
RAGConfig.MINERU_API_KEY === ''
) {
toast.error($i18n.t('MinerU API Key required for Cloud API mode.'));
return;
}
if (!RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL) { if (!RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL) {
await embeddingModelUpdateHandler(); await embeddingModelUpdateHandler();
} }
@ -337,6 +346,7 @@
<option value="datalab_marker">{$i18n.t('Datalab Marker API')}</option> <option value="datalab_marker">{$i18n.t('Datalab Marker API')}</option>
<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option> <option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
<option value="mistral_ocr">{$i18n.t('Mistral OCR')}</option> <option value="mistral_ocr">{$i18n.t('Mistral OCR')}</option>
<option value="mineru">{$i18n.t('MinerU')}</option>
</select> </select>
</div> </div>
</div> </div>
@ -749,6 +759,137 @@
bind:value={RAGConfig.MISTRAL_OCR_API_KEY} bind:value={RAGConfig.MISTRAL_OCR_API_KEY}
/> />
</div> </div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mineru'}
<!-- API Mode Selection -->
<div class="flex w-full mt-2">
<div class="flex-1 flex justify-between">
<div class="self-center text-xs font-medium">
{$i18n.t('API Mode')}
</div>
<select
class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden"
bind:value={RAGConfig.MINERU_API_MODE}
on:change={() => {
// Auto-update URL when switching modes if it's empty or matches the opposite mode's default
const cloudUrl = 'https://mineru.net/api/v4';
const localUrl = 'http://localhost:8000';
if (RAGConfig.MINERU_API_MODE === 'cloud') {
if (!RAGConfig.MINERU_API_URL || RAGConfig.MINERU_API_URL === localUrl) {
RAGConfig.MINERU_API_URL = cloudUrl;
}
} else {
if (!RAGConfig.MINERU_API_URL || RAGConfig.MINERU_API_URL === cloudUrl) {
RAGConfig.MINERU_API_URL = localUrl;
}
}
}}
>
<option value="local">{$i18n.t('Self-Hosted')}</option>
<option value="cloud">{$i18n.t('minerU managed (Cloud API)')}</option>
</select>
</div>
</div>
<!-- API URL -->
<div class="flex w-full mt-2">
<input
class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={RAGConfig.MINERU_API_MODE === 'cloud'
? $i18n.t('https://mineru.net/api/v4')
: $i18n.t('http://localhost:8000')}
bind:value={RAGConfig.MINERU_API_URL}
/>
</div>
<!-- API Key (Cloud only) -->
{#if RAGConfig.MINERU_API_MODE === 'cloud'}
<div class="flex w-full mt-2">
<SensitiveInput
placeholder={$i18n.t('Enter MinerU API Key')}
bind:value={RAGConfig.MINERU_API_KEY}
/>
</div>
{/if}
<!-- OCR Toggle -->
<div class="flex w-full mt-2">
<div class="flex-1 flex justify-between">
<div class="self-center text-xs font-medium">
{$i18n.t('Enable OCR (for scanned documents)')}
</div>
<div class="flex items-center relative">
<Switch bind:state={RAGConfig.MINERU_ENABLE_OCR} />
</div>
</div>
</div>
<!-- Formula Recognition -->
<div class="flex w-full mt-2">
<div class="flex-1 flex justify-between">
<div class="self-center text-xs font-medium">
{$i18n.t('Enable Formula Recognition')}
</div>
<div class="flex items-center relative">
<Switch bind:state={RAGConfig.MINERU_ENABLE_FORMULA} />
</div>
</div>
</div>
<!-- Table Recognition -->
<div class="flex w-full mt-2">
<div class="flex-1 flex justify-between">
<div class="self-center text-xs font-medium">
{$i18n.t('Enable Table Recognition')}
</div>
<div class="flex items-center relative">
<Switch bind:state={RAGConfig.MINERU_ENABLE_TABLE} />
</div>
</div>
</div>
<!-- Advanced Settings Toggle -->
<details class="w-full mt-2">
<summary class="text-xs font-medium cursor-pointer hover:text-gray-600 dark:hover:text-gray-300">
{$i18n.t('Advanced Settings')}
</summary>
<div class="mt-2 space-y-2 pl-2 border-l-2 border-gray-200 dark:border-gray-700">
<!-- Model Version -->
<div class="flex w-full">
<div class="flex-1 flex justify-between">
<div class="self-center text-xs font-medium">
{$i18n.t('Model Version')}
</div>
<select
class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden"
bind:value={RAGConfig.MINERU_MODEL_VERSION}
>
<option value="pipeline">{$i18n.t('Pipeline (Faster, CPU-friendly)')}</option>
<option value="vlm">{$i18n.t('VLM (More Accurate, GPU required)')}</option>
</select>
</div>
</div>
<!-- Language -->
<div class="flex w-full">
<input
class="flex-1 w-full text-xs bg-transparent outline-hidden"
placeholder={$i18n.t('Language: en, ch, japan, korean, etc. (default: en)')}
bind:value={RAGConfig.MINERU_LANGUAGE}
/>
</div>
<!-- Page Ranges (Optional) -->
<div class="flex w-full">
<input
class="flex-1 w-full text-xs bg-transparent outline-hidden"
placeholder={$i18n.t('Page ranges (optional): e.g., 1-10,15,20-25')}
bind:value={RAGConfig.MINERU_PAGE_RANGES}
/>
</div>
</div>
</details>
{/if} {/if}
</div> </div>