mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-12 04:15:25 +00:00
enh: configurable mistral ocr base url
This commit is contained in:
parent
00520a9602
commit
415b93c7c3
6 changed files with 38 additions and 11 deletions
|
|
@ -2464,6 +2464,12 @@ DOCUMENT_INTELLIGENCE_KEY = PersistentConfig(
|
||||||
os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""),
|
os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
MISTRAL_OCR_API_BASE_URL = PersistentConfig(
|
||||||
|
"MISTRAL_OCR_API_BASE_URL",
|
||||||
|
"rag.MISTRAL_OCR_API_BASE_URL",
|
||||||
|
os.getenv("MISTRAL_OCR_API_BASE_URL", "https://api.mistral.ai/v1"),
|
||||||
|
)
|
||||||
|
|
||||||
MISTRAL_OCR_API_KEY = PersistentConfig(
|
MISTRAL_OCR_API_KEY = PersistentConfig(
|
||||||
"MISTRAL_OCR_API_KEY",
|
"MISTRAL_OCR_API_KEY",
|
||||||
"rag.mistral_ocr_api_key",
|
"rag.mistral_ocr_api_key",
|
||||||
|
|
|
||||||
|
|
@ -276,6 +276,7 @@ from open_webui.config import (
|
||||||
DOCLING_PICTURE_DESCRIPTION_API,
|
DOCLING_PICTURE_DESCRIPTION_API,
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT,
|
DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
DOCUMENT_INTELLIGENCE_KEY,
|
DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
MISTRAL_OCR_API_BASE_URL,
|
||||||
MISTRAL_OCR_API_KEY,
|
MISTRAL_OCR_API_KEY,
|
||||||
RAG_TEXT_SPLITTER,
|
RAG_TEXT_SPLITTER,
|
||||||
TIKTOKEN_ENCODING_NAME,
|
TIKTOKEN_ENCODING_NAME,
|
||||||
|
|
@ -868,6 +869,7 @@ app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL = DOCLING_PICTURE_DESCRIPTION
|
||||||
app.state.config.DOCLING_PICTURE_DESCRIPTION_API = DOCLING_PICTURE_DESCRIPTION_API
|
app.state.config.DOCLING_PICTURE_DESCRIPTION_API = DOCLING_PICTURE_DESCRIPTION_API
|
||||||
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
|
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
|
||||||
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
||||||
|
app.state.config.MISTRAL_OCR_API_BASE_URL = MISTRAL_OCR_API_BASE_URL
|
||||||
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
|
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
|
||||||
app.state.config.MINERU_API_MODE = MINERU_API_MODE
|
app.state.config.MINERU_API_MODE = MINERU_API_MODE
|
||||||
app.state.config.MINERU_API_URL = MINERU_API_URL
|
app.state.config.MINERU_API_URL = MINERU_API_URL
|
||||||
|
|
|
||||||
|
|
@ -384,7 +384,9 @@ class Loader:
|
||||||
in ["pdf"] # Mistral OCR currently only supports PDF and images
|
in ["pdf"] # Mistral OCR currently only supports PDF and images
|
||||||
):
|
):
|
||||||
loader = MistralLoader(
|
loader = MistralLoader(
|
||||||
api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path
|
base_url=self.kwargs.get("MISTRAL_OCR_API_BASE_URL"),
|
||||||
|
api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"),
|
||||||
|
file_path=file_path,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if file_ext == "pdf":
|
if file_ext == "pdf":
|
||||||
|
|
|
||||||
|
|
@ -30,10 +30,9 @@ class MistralLoader:
|
||||||
- Enhanced error handling with retryable error classification
|
- Enhanced error handling with retryable error classification
|
||||||
"""
|
"""
|
||||||
|
|
||||||
BASE_API_URL = "https://api.mistral.ai/v1"
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
base_url: str,
|
||||||
api_key: str,
|
api_key: str,
|
||||||
file_path: str,
|
file_path: str,
|
||||||
timeout: int = 300, # 5 minutes default
|
timeout: int = 300, # 5 minutes default
|
||||||
|
|
@ -55,6 +54,9 @@ class MistralLoader:
|
||||||
if not os.path.exists(file_path):
|
if not os.path.exists(file_path):
|
||||||
raise FileNotFoundError(f"File not found at {file_path}")
|
raise FileNotFoundError(f"File not found at {file_path}")
|
||||||
|
|
||||||
|
self.base_url = (
|
||||||
|
base_url.rstrip("/") if base_url else "https://api.mistral.ai/v1"
|
||||||
|
)
|
||||||
self.api_key = api_key
|
self.api_key = api_key
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
|
|
@ -240,7 +242,7 @@ class MistralLoader:
|
||||||
in a context manager to minimize memory usage duration.
|
in a context manager to minimize memory usage duration.
|
||||||
"""
|
"""
|
||||||
log.info("Uploading file to Mistral API")
|
log.info("Uploading file to Mistral API")
|
||||||
url = f"{self.BASE_API_URL}/files"
|
url = f"{self.base_url}/files"
|
||||||
|
|
||||||
def upload_request():
|
def upload_request():
|
||||||
# MEMORY OPTIMIZATION: Use context manager to minimize file handle lifetime
|
# MEMORY OPTIMIZATION: Use context manager to minimize file handle lifetime
|
||||||
|
|
@ -275,7 +277,7 @@ class MistralLoader:
|
||||||
|
|
||||||
async def _upload_file_async(self, session: aiohttp.ClientSession) -> str:
|
async def _upload_file_async(self, session: aiohttp.ClientSession) -> str:
|
||||||
"""Async file upload with streaming for better memory efficiency."""
|
"""Async file upload with streaming for better memory efficiency."""
|
||||||
url = f"{self.BASE_API_URL}/files"
|
url = f"{self.base_url}/files"
|
||||||
|
|
||||||
async def upload_request():
|
async def upload_request():
|
||||||
# Create multipart writer for streaming upload
|
# Create multipart writer for streaming upload
|
||||||
|
|
@ -321,7 +323,7 @@ class MistralLoader:
|
||||||
def _get_signed_url(self, file_id: str) -> str:
|
def _get_signed_url(self, file_id: str) -> str:
|
||||||
"""Retrieves a temporary signed URL for the uploaded file (sync version)."""
|
"""Retrieves a temporary signed URL for the uploaded file (sync version)."""
|
||||||
log.info(f"Getting signed URL for file ID: {file_id}")
|
log.info(f"Getting signed URL for file ID: {file_id}")
|
||||||
url = f"{self.BASE_API_URL}/files/{file_id}/url"
|
url = f"{self.base_url}/files/{file_id}/url"
|
||||||
params = {"expiry": 1}
|
params = {"expiry": 1}
|
||||||
signed_url_headers = {**self.headers, "Accept": "application/json"}
|
signed_url_headers = {**self.headers, "Accept": "application/json"}
|
||||||
|
|
||||||
|
|
@ -346,7 +348,7 @@ class MistralLoader:
|
||||||
self, session: aiohttp.ClientSession, file_id: str
|
self, session: aiohttp.ClientSession, file_id: str
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Async signed URL retrieval."""
|
"""Async signed URL retrieval."""
|
||||||
url = f"{self.BASE_API_URL}/files/{file_id}/url"
|
url = f"{self.base_url}/files/{file_id}/url"
|
||||||
params = {"expiry": 1}
|
params = {"expiry": 1}
|
||||||
|
|
||||||
headers = {**self.headers, "Accept": "application/json"}
|
headers = {**self.headers, "Accept": "application/json"}
|
||||||
|
|
@ -373,7 +375,7 @@ class MistralLoader:
|
||||||
def _process_ocr(self, signed_url: str) -> Dict[str, Any]:
|
def _process_ocr(self, signed_url: str) -> Dict[str, Any]:
|
||||||
"""Sends the signed URL to the OCR endpoint for processing (sync version)."""
|
"""Sends the signed URL to the OCR endpoint for processing (sync version)."""
|
||||||
log.info("Processing OCR via Mistral API")
|
log.info("Processing OCR via Mistral API")
|
||||||
url = f"{self.BASE_API_URL}/ocr"
|
url = f"{self.base_url}/ocr"
|
||||||
ocr_headers = {
|
ocr_headers = {
|
||||||
**self.headers,
|
**self.headers,
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
|
|
@ -407,7 +409,7 @@ class MistralLoader:
|
||||||
self, session: aiohttp.ClientSession, signed_url: str
|
self, session: aiohttp.ClientSession, signed_url: str
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Async OCR processing with timing metrics."""
|
"""Async OCR processing with timing metrics."""
|
||||||
url = f"{self.BASE_API_URL}/ocr"
|
url = f"{self.base_url}/ocr"
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
**self.headers,
|
**self.headers,
|
||||||
|
|
@ -446,7 +448,7 @@ class MistralLoader:
|
||||||
def _delete_file(self, file_id: str) -> None:
|
def _delete_file(self, file_id: str) -> None:
|
||||||
"""Deletes the file from Mistral storage (sync version)."""
|
"""Deletes the file from Mistral storage (sync version)."""
|
||||||
log.info(f"Deleting uploaded file ID: {file_id}")
|
log.info(f"Deleting uploaded file ID: {file_id}")
|
||||||
url = f"{self.BASE_API_URL}/files/{file_id}"
|
url = f"{self.base_url}/files/{file_id}"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.delete(
|
response = requests.delete(
|
||||||
|
|
@ -467,7 +469,7 @@ class MistralLoader:
|
||||||
async def delete_request():
|
async def delete_request():
|
||||||
self._debug_log(f"Deleting file ID: {file_id}")
|
self._debug_log(f"Deleting file ID: {file_id}")
|
||||||
async with session.delete(
|
async with session.delete(
|
||||||
url=f"{self.BASE_API_URL}/files/{file_id}",
|
url=f"{self.base_url}/files/{file_id}",
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
timeout=aiohttp.ClientTimeout(
|
timeout=aiohttp.ClientTimeout(
|
||||||
total=self.cleanup_timeout
|
total=self.cleanup_timeout
|
||||||
|
|
|
||||||
|
|
@ -465,6 +465,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||||
"DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
|
"DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
|
||||||
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
"MISTRAL_OCR_API_BASE_URL": request.app.state.config.MISTRAL_OCR_API_BASE_URL,
|
||||||
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
||||||
# MinerU settings
|
# MinerU settings
|
||||||
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
||||||
|
|
@ -650,6 +651,7 @@ class ConfigForm(BaseModel):
|
||||||
DOCLING_PICTURE_DESCRIPTION_API: Optional[dict] = None
|
DOCLING_PICTURE_DESCRIPTION_API: Optional[dict] = None
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
|
DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
|
||||||
DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
|
DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
|
||||||
|
MISTRAL_OCR_API_BASE_URL: Optional[str] = None
|
||||||
MISTRAL_OCR_API_KEY: Optional[str] = None
|
MISTRAL_OCR_API_KEY: Optional[str] = None
|
||||||
|
|
||||||
# MinerU settings
|
# MinerU settings
|
||||||
|
|
@ -891,6 +893,12 @@ async def update_rag_config(
|
||||||
if form_data.DOCUMENT_INTELLIGENCE_KEY is not None
|
if form_data.DOCUMENT_INTELLIGENCE_KEY is not None
|
||||||
else request.app.state.config.DOCUMENT_INTELLIGENCE_KEY
|
else request.app.state.config.DOCUMENT_INTELLIGENCE_KEY
|
||||||
)
|
)
|
||||||
|
|
||||||
|
request.app.state.config.MISTRAL_OCR_API_BASE_URL = (
|
||||||
|
form_data.MISTRAL_OCR_API_BASE_URL
|
||||||
|
if form_data.MISTRAL_OCR_API_BASE_URL is not None
|
||||||
|
else request.app.state.config.MISTRAL_OCR_API_BASE_URL
|
||||||
|
)
|
||||||
request.app.state.config.MISTRAL_OCR_API_KEY = (
|
request.app.state.config.MISTRAL_OCR_API_KEY = (
|
||||||
form_data.MISTRAL_OCR_API_KEY
|
form_data.MISTRAL_OCR_API_KEY
|
||||||
if form_data.MISTRAL_OCR_API_KEY is not None
|
if form_data.MISTRAL_OCR_API_KEY is not None
|
||||||
|
|
@ -1182,6 +1190,7 @@ async def update_rag_config(
|
||||||
"DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
|
"DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
|
||||||
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
"DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
"DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
"MISTRAL_OCR_API_BASE_URL": request.app.state.config.MISTRAL_OCR_API_BASE_URL,
|
||||||
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
"MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
|
||||||
# MinerU settings
|
# MinerU settings
|
||||||
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
||||||
|
|
@ -1597,6 +1606,7 @@ def process_file(
|
||||||
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
MISTRAL_OCR_API_BASE_URL=request.app.state.config.MISTRAL_OCR_API_BASE_URL,
|
||||||
MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY,
|
MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY,
|
||||||
MINERU_API_MODE=request.app.state.config.MINERU_API_MODE,
|
MINERU_API_MODE=request.app.state.config.MINERU_API_MODE,
|
||||||
MINERU_API_URL=request.app.state.config.MINERU_API_URL,
|
MINERU_API_URL=request.app.state.config.MINERU_API_URL,
|
||||||
|
|
|
||||||
|
|
@ -766,6 +766,11 @@
|
||||||
</div>
|
</div>
|
||||||
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mistral_ocr'}
|
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mistral_ocr'}
|
||||||
<div class="my-0.5 flex gap-2 pr-2">
|
<div class="my-0.5 flex gap-2 pr-2">
|
||||||
|
<input
|
||||||
|
class="flex-1 w-full text-sm bg-transparent outline-hidden"
|
||||||
|
placeholder={$i18n.t('Enter Mistral API Base URL')}
|
||||||
|
bind:value={RAGConfig.MISTRAL_OCR_API_BASE_URL}
|
||||||
|
/>
|
||||||
<SensitiveInput
|
<SensitiveInput
|
||||||
placeholder={$i18n.t('Enter Mistral API Key')}
|
placeholder={$i18n.t('Enter Mistral API Key')}
|
||||||
bind:value={RAGConfig.MISTRAL_OCR_API_KEY}
|
bind:value={RAGConfig.MISTRAL_OCR_API_KEY}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue