mirror of
https://github.com/open-webui/open-webui.git
synced 2025-12-12 04:15:25 +00:00
feat: use MINERU_PARAMS json field for mineru settings
This commit is contained in:
parent
40e9d9c330
commit
288b323df8
6 changed files with 77 additions and 213 deletions
|
|
@ -2291,7 +2291,6 @@ DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
|
||||||
os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"),
|
os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"),
|
||||||
)
|
)
|
||||||
|
|
||||||
# MinerU Configuration
|
|
||||||
MINERU_API_MODE = PersistentConfig(
|
MINERU_API_MODE = PersistentConfig(
|
||||||
"MINERU_API_MODE",
|
"MINERU_API_MODE",
|
||||||
"rag.mineru_api_mode",
|
"rag.mineru_api_mode",
|
||||||
|
|
@ -2310,40 +2309,16 @@ MINERU_API_KEY = PersistentConfig(
|
||||||
os.environ.get("MINERU_API_KEY", ""),
|
os.environ.get("MINERU_API_KEY", ""),
|
||||||
)
|
)
|
||||||
|
|
||||||
MINERU_ENABLE_OCR = PersistentConfig(
|
mineru_params = os.getenv("MINERU_PARAMS", "")
|
||||||
"MINERU_ENABLE_OCR",
|
try:
|
||||||
"rag.mineru_enable_ocr",
|
mineru_params = json.loads(mineru_params)
|
||||||
os.environ.get("MINERU_ENABLE_OCR", "false").lower() == "true",
|
except json.JSONDecodeError:
|
||||||
)
|
mineru_params = {}
|
||||||
|
|
||||||
MINERU_ENABLE_FORMULA = PersistentConfig(
|
MINERU_PARAMS = PersistentConfig(
|
||||||
"MINERU_ENABLE_FORMULA",
|
"MINERU_PARAMS",
|
||||||
"rag.mineru_enable_formula",
|
"rag.mineru_params",
|
||||||
os.environ.get("MINERU_ENABLE_FORMULA", "true").lower() == "true",
|
mineru_params,
|
||||||
)
|
|
||||||
|
|
||||||
MINERU_ENABLE_TABLE = PersistentConfig(
|
|
||||||
"MINERU_ENABLE_TABLE",
|
|
||||||
"rag.mineru_enable_table",
|
|
||||||
os.environ.get("MINERU_ENABLE_TABLE", "true").lower() == "true",
|
|
||||||
)
|
|
||||||
|
|
||||||
MINERU_LANGUAGE = PersistentConfig(
|
|
||||||
"MINERU_LANGUAGE",
|
|
||||||
"rag.mineru_language",
|
|
||||||
os.environ.get("MINERU_LANGUAGE", "en"),
|
|
||||||
)
|
|
||||||
|
|
||||||
MINERU_MODEL_VERSION = PersistentConfig(
|
|
||||||
"MINERU_MODEL_VERSION",
|
|
||||||
"rag.mineru_model_version",
|
|
||||||
os.environ.get("MINERU_MODEL_VERSION", "pipeline"), # "pipeline" or "vlm"
|
|
||||||
)
|
|
||||||
|
|
||||||
MINERU_PAGE_RANGES = PersistentConfig(
|
|
||||||
"MINERU_PAGE_RANGES",
|
|
||||||
"rag.mineru_page_ranges",
|
|
||||||
os.environ.get("MINERU_PAGE_RANGES", ""),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig(
|
EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig(
|
||||||
|
|
|
||||||
|
|
@ -246,12 +246,7 @@ from open_webui.config import (
|
||||||
MINERU_API_MODE,
|
MINERU_API_MODE,
|
||||||
MINERU_API_URL,
|
MINERU_API_URL,
|
||||||
MINERU_API_KEY,
|
MINERU_API_KEY,
|
||||||
MINERU_ENABLE_OCR,
|
MINERU_PARAMS,
|
||||||
MINERU_ENABLE_FORMULA,
|
|
||||||
MINERU_ENABLE_TABLE,
|
|
||||||
MINERU_LANGUAGE,
|
|
||||||
MINERU_MODEL_VERSION,
|
|
||||||
MINERU_PAGE_RANGES,
|
|
||||||
DATALAB_MARKER_USE_LLM,
|
DATALAB_MARKER_USE_LLM,
|
||||||
EXTERNAL_DOCUMENT_LOADER_URL,
|
EXTERNAL_DOCUMENT_LOADER_URL,
|
||||||
EXTERNAL_DOCUMENT_LOADER_API_KEY,
|
EXTERNAL_DOCUMENT_LOADER_API_KEY,
|
||||||
|
|
@ -865,12 +860,7 @@ app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY
|
||||||
app.state.config.MINERU_API_MODE = MINERU_API_MODE
|
app.state.config.MINERU_API_MODE = MINERU_API_MODE
|
||||||
app.state.config.MINERU_API_URL = MINERU_API_URL
|
app.state.config.MINERU_API_URL = MINERU_API_URL
|
||||||
app.state.config.MINERU_API_KEY = MINERU_API_KEY
|
app.state.config.MINERU_API_KEY = MINERU_API_KEY
|
||||||
app.state.config.MINERU_ENABLE_OCR = MINERU_ENABLE_OCR
|
app.state.config.MINERU_PARAMS = MINERU_PARAMS
|
||||||
app.state.config.MINERU_ENABLE_FORMULA = MINERU_ENABLE_FORMULA
|
|
||||||
app.state.config.MINERU_ENABLE_TABLE = MINERU_ENABLE_TABLE
|
|
||||||
app.state.config.MINERU_LANGUAGE = MINERU_LANGUAGE
|
|
||||||
app.state.config.MINERU_MODEL_VERSION = MINERU_MODEL_VERSION
|
|
||||||
app.state.config.MINERU_PAGE_RANGES = MINERU_PAGE_RANGES
|
|
||||||
|
|
||||||
app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
|
app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
|
||||||
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
|
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
|
||||||
|
|
|
||||||
|
|
@ -382,12 +382,7 @@ class Loader:
|
||||||
api_mode=self.kwargs.get("MINERU_API_MODE", "local"),
|
api_mode=self.kwargs.get("MINERU_API_MODE", "local"),
|
||||||
api_url=self.kwargs.get("MINERU_API_URL", "http://localhost:8000"),
|
api_url=self.kwargs.get("MINERU_API_URL", "http://localhost:8000"),
|
||||||
api_key=self.kwargs.get("MINERU_API_KEY", ""),
|
api_key=self.kwargs.get("MINERU_API_KEY", ""),
|
||||||
enable_ocr=self.kwargs.get("MINERU_ENABLE_OCR", False),
|
params=self.kwargs.get("MINERU_PARAMS", {}),
|
||||||
enable_formula=self.kwargs.get("MINERU_ENABLE_FORMULA", True),
|
|
||||||
enable_table=self.kwargs.get("MINERU_ENABLE_TABLE", True),
|
|
||||||
language=self.kwargs.get("MINERU_LANGUAGE", "en"),
|
|
||||||
model_version=self.kwargs.get("MINERU_MODEL_VERSION", "pipeline"),
|
|
||||||
page_ranges=self.kwargs.get("MINERU_PAGE_RANGES", ""),
|
|
||||||
)
|
)
|
||||||
elif (
|
elif (
|
||||||
self.engine == "mistral_ocr"
|
self.engine == "mistral_ocr"
|
||||||
|
|
|
||||||
|
|
@ -25,23 +25,21 @@ class MinerULoader:
|
||||||
api_mode: str = "local",
|
api_mode: str = "local",
|
||||||
api_url: str = "http://localhost:8000",
|
api_url: str = "http://localhost:8000",
|
||||||
api_key: str = "",
|
api_key: str = "",
|
||||||
enable_ocr: bool = False,
|
params: dict = None,
|
||||||
enable_formula: bool = True,
|
|
||||||
enable_table: bool = True,
|
|
||||||
language: str = "en",
|
|
||||||
model_version: str = "pipeline",
|
|
||||||
page_ranges: str = "",
|
|
||||||
):
|
):
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.api_mode = api_mode.lower()
|
self.api_mode = api_mode.lower()
|
||||||
self.api_url = api_url.rstrip("/")
|
self.api_url = api_url.rstrip("/")
|
||||||
self.api_key = api_key
|
self.api_key = api_key
|
||||||
self.enable_ocr = enable_ocr
|
|
||||||
self.enable_formula = enable_formula
|
# Parse params dict with defaults
|
||||||
self.enable_table = enable_table
|
params = params or {}
|
||||||
self.language = language
|
self.enable_ocr = params.get("enable_ocr", False)
|
||||||
self.model_version = model_version
|
self.enable_formula = params.get("enable_formula", True)
|
||||||
self.page_ranges = page_ranges
|
self.enable_table = params.get("enable_table", True)
|
||||||
|
self.language = params.get("language", "en")
|
||||||
|
self.model_version = params.get("model_version", "pipeline")
|
||||||
|
self.page_ranges = params.get("page_ranges", "")
|
||||||
|
|
||||||
# Validate API mode
|
# Validate API mode
|
||||||
if self.api_mode not in ["local", "cloud"]:
|
if self.api_mode not in ["local", "cloud"]:
|
||||||
|
|
|
||||||
|
|
@ -470,12 +470,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||||
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
||||||
"MINERU_API_URL": request.app.state.config.MINERU_API_URL,
|
"MINERU_API_URL": request.app.state.config.MINERU_API_URL,
|
||||||
"MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
|
"MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
|
||||||
"MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR,
|
"MINERU_PARAMS": request.app.state.config.MINERU_PARAMS,
|
||||||
"MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA,
|
|
||||||
"MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE,
|
|
||||||
"MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE,
|
|
||||||
"MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION,
|
|
||||||
"MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES,
|
|
||||||
# Reranking settings
|
# Reranking settings
|
||||||
"RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
|
"RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
|
||||||
"RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
|
"RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
|
||||||
|
|
@ -661,12 +656,7 @@ class ConfigForm(BaseModel):
|
||||||
MINERU_API_MODE: Optional[str] = None
|
MINERU_API_MODE: Optional[str] = None
|
||||||
MINERU_API_URL: Optional[str] = None
|
MINERU_API_URL: Optional[str] = None
|
||||||
MINERU_API_KEY: Optional[str] = None
|
MINERU_API_KEY: Optional[str] = None
|
||||||
MINERU_ENABLE_OCR: Optional[bool] = None
|
MINERU_PARAMS: Optional[dict] = None
|
||||||
MINERU_ENABLE_FORMULA: Optional[bool] = None
|
|
||||||
MINERU_ENABLE_TABLE: Optional[bool] = None
|
|
||||||
MINERU_LANGUAGE: Optional[str] = None
|
|
||||||
MINERU_MODEL_VERSION: Optional[str] = None
|
|
||||||
MINERU_PAGE_RANGES: Optional[str] = None
|
|
||||||
|
|
||||||
# Reranking settings
|
# Reranking settings
|
||||||
RAG_RERANKING_MODEL: Optional[str] = None
|
RAG_RERANKING_MODEL: Optional[str] = None
|
||||||
|
|
@ -923,35 +913,10 @@ async def update_rag_config(
|
||||||
if form_data.MINERU_API_KEY is not None
|
if form_data.MINERU_API_KEY is not None
|
||||||
else request.app.state.config.MINERU_API_KEY
|
else request.app.state.config.MINERU_API_KEY
|
||||||
)
|
)
|
||||||
request.app.state.config.MINERU_ENABLE_OCR = (
|
request.app.state.config.MINERU_PARAMS = (
|
||||||
form_data.MINERU_ENABLE_OCR
|
form_data.MINERU_PARAMS
|
||||||
if form_data.MINERU_ENABLE_OCR is not None
|
if form_data.MINERU_PARAMS is not None
|
||||||
else request.app.state.config.MINERU_ENABLE_OCR
|
else request.app.state.config.MINERU_PARAMS
|
||||||
)
|
|
||||||
request.app.state.config.MINERU_ENABLE_FORMULA = (
|
|
||||||
form_data.MINERU_ENABLE_FORMULA
|
|
||||||
if form_data.MINERU_ENABLE_FORMULA is not None
|
|
||||||
else request.app.state.config.MINERU_ENABLE_FORMULA
|
|
||||||
)
|
|
||||||
request.app.state.config.MINERU_ENABLE_TABLE = (
|
|
||||||
form_data.MINERU_ENABLE_TABLE
|
|
||||||
if form_data.MINERU_ENABLE_TABLE is not None
|
|
||||||
else request.app.state.config.MINERU_ENABLE_TABLE
|
|
||||||
)
|
|
||||||
request.app.state.config.MINERU_LANGUAGE = (
|
|
||||||
form_data.MINERU_LANGUAGE
|
|
||||||
if form_data.MINERU_LANGUAGE is not None
|
|
||||||
else request.app.state.config.MINERU_LANGUAGE
|
|
||||||
)
|
|
||||||
request.app.state.config.MINERU_MODEL_VERSION = (
|
|
||||||
form_data.MINERU_MODEL_VERSION
|
|
||||||
if form_data.MINERU_MODEL_VERSION is not None
|
|
||||||
else request.app.state.config.MINERU_MODEL_VERSION
|
|
||||||
)
|
|
||||||
request.app.state.config.MINERU_PAGE_RANGES = (
|
|
||||||
form_data.MINERU_PAGE_RANGES
|
|
||||||
if form_data.MINERU_PAGE_RANGES is not None
|
|
||||||
else request.app.state.config.MINERU_PAGE_RANGES
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Reranking settings
|
# Reranking settings
|
||||||
|
|
@ -1222,12 +1187,7 @@ async def update_rag_config(
|
||||||
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
"MINERU_API_MODE": request.app.state.config.MINERU_API_MODE,
|
||||||
"MINERU_API_URL": request.app.state.config.MINERU_API_URL,
|
"MINERU_API_URL": request.app.state.config.MINERU_API_URL,
|
||||||
"MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
|
"MINERU_API_KEY": request.app.state.config.MINERU_API_KEY,
|
||||||
"MINERU_ENABLE_OCR": request.app.state.config.MINERU_ENABLE_OCR,
|
"MINERU_PARAMS": request.app.state.config.MINERU_PARAMS,
|
||||||
"MINERU_ENABLE_FORMULA": request.app.state.config.MINERU_ENABLE_FORMULA,
|
|
||||||
"MINERU_ENABLE_TABLE": request.app.state.config.MINERU_ENABLE_TABLE,
|
|
||||||
"MINERU_LANGUAGE": request.app.state.config.MINERU_LANGUAGE,
|
|
||||||
"MINERU_MODEL_VERSION": request.app.state.config.MINERU_MODEL_VERSION,
|
|
||||||
"MINERU_PAGE_RANGES": request.app.state.config.MINERU_PAGE_RANGES,
|
|
||||||
# Reranking settings
|
# Reranking settings
|
||||||
"RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
|
"RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
|
||||||
"RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
|
"RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
|
||||||
|
|
@ -1641,12 +1601,7 @@ def process_file(
|
||||||
MINERU_API_MODE=request.app.state.config.MINERU_API_MODE,
|
MINERU_API_MODE=request.app.state.config.MINERU_API_MODE,
|
||||||
MINERU_API_URL=request.app.state.config.MINERU_API_URL,
|
MINERU_API_URL=request.app.state.config.MINERU_API_URL,
|
||||||
MINERU_API_KEY=request.app.state.config.MINERU_API_KEY,
|
MINERU_API_KEY=request.app.state.config.MINERU_API_KEY,
|
||||||
MINERU_ENABLE_OCR=request.app.state.config.MINERU_ENABLE_OCR,
|
MINERU_PARAMS=request.app.state.config.MINERU_PARAMS,
|
||||||
MINERU_ENABLE_FORMULA=request.app.state.config.MINERU_ENABLE_FORMULA,
|
|
||||||
MINERU_ENABLE_TABLE=request.app.state.config.MINERU_ENABLE_TABLE,
|
|
||||||
MINERU_LANGUAGE=request.app.state.config.MINERU_LANGUAGE,
|
|
||||||
MINERU_MODEL_VERSION=request.app.state.config.MINERU_MODEL_VERSION,
|
|
||||||
MINERU_PAGE_RANGES=request.app.state.config.MINERU_PAGE_RANGES,
|
|
||||||
)
|
)
|
||||||
docs = loader.load(
|
docs = loader.load(
|
||||||
file.filename, file.meta.get("content_type"), file_path
|
file.filename, file.meta.get("content_type"), file_path
|
||||||
|
|
|
||||||
|
|
@ -812,84 +812,35 @@
|
||||||
</div>
|
</div>
|
||||||
{/if}
|
{/if}
|
||||||
|
|
||||||
<!-- OCR Toggle -->
|
<!-- Parameters -->
|
||||||
<div class="flex w-full mt-2">
|
<div class="flex justify-between w-full mt-2">
|
||||||
<div class="flex-1 flex justify-between">
|
|
||||||
<div class="self-center text-xs font-medium">
|
<div class="self-center text-xs font-medium">
|
||||||
{$i18n.t('Enable OCR (for scanned documents)')}
|
<Tooltip
|
||||||
</div>
|
content={$i18n.t('Advanced parameters for MinerU parsing (enable_ocr, enable_formula, enable_table, language, model_version, page_ranges)')}
|
||||||
<div class="flex items-center relative">
|
placement="top-start"
|
||||||
<Switch bind:state={RAGConfig.MINERU_ENABLE_OCR} />
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Formula Recognition -->
|
|
||||||
<div class="flex w-full mt-2">
|
|
||||||
<div class="flex-1 flex justify-between">
|
|
||||||
<div class="self-center text-xs font-medium">
|
|
||||||
{$i18n.t('Enable Formula Recognition')}
|
|
||||||
</div>
|
|
||||||
<div class="flex items-center relative">
|
|
||||||
<Switch bind:state={RAGConfig.MINERU_ENABLE_FORMULA} />
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Table Recognition -->
|
|
||||||
<div class="flex w-full mt-2">
|
|
||||||
<div class="flex-1 flex justify-between">
|
|
||||||
<div class="self-center text-xs font-medium">
|
|
||||||
{$i18n.t('Enable Table Recognition')}
|
|
||||||
</div>
|
|
||||||
<div class="flex items-center relative">
|
|
||||||
<Switch bind:state={RAGConfig.MINERU_ENABLE_TABLE} />
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Advanced Settings Toggle -->
|
|
||||||
<details class="w-full mt-2">
|
|
||||||
<summary class="text-xs font-medium cursor-pointer hover:text-gray-600 dark:hover:text-gray-300">
|
|
||||||
{$i18n.t('Advanced Settings')}
|
|
||||||
</summary>
|
|
||||||
|
|
||||||
<div class="mt-2 space-y-2 pl-2 border-l-2 border-gray-200 dark:border-gray-700">
|
|
||||||
<!-- Model Version -->
|
|
||||||
<div class="flex w-full">
|
|
||||||
<div class="flex-1 flex justify-between">
|
|
||||||
<div class="self-center text-xs font-medium">
|
|
||||||
{$i18n.t('Model Version')}
|
|
||||||
</div>
|
|
||||||
<select
|
|
||||||
class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden"
|
|
||||||
bind:value={RAGConfig.MINERU_MODEL_VERSION}
|
|
||||||
>
|
>
|
||||||
<option value="pipeline">{$i18n.t('Pipeline (Faster, CPU-friendly)')}</option>
|
{$i18n.t('Parameters')}
|
||||||
<option value="vlm">{$i18n.t('VLM (More Accurate, GPU required)')}</option>
|
</Tooltip>
|
||||||
</select>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
<div class="">
|
||||||
|
<Textarea
|
||||||
<!-- Language -->
|
value={typeof RAGConfig.MINERU_PARAMS === 'object' && RAGConfig.MINERU_PARAMS !== null && Object.keys(RAGConfig.MINERU_PARAMS).length > 0
|
||||||
<div class="flex w-full">
|
? JSON.stringify(RAGConfig.MINERU_PARAMS, null, 2)
|
||||||
<input
|
: ''}
|
||||||
class="flex-1 w-full text-xs bg-transparent outline-hidden"
|
on:input={(e) => {
|
||||||
placeholder={$i18n.t('Language: en, ch, japan, korean, etc. (default: en)')}
|
try {
|
||||||
bind:value={RAGConfig.MINERU_LANGUAGE}
|
const value = e.target.value.trim();
|
||||||
/>
|
RAGConfig.MINERU_PARAMS = value ? JSON.parse(value) : {};
|
||||||
</div>
|
} catch (err) {
|
||||||
|
// Keep the string value if JSON is invalid (user is still typing)
|
||||||
<!-- Page Ranges (Optional) -->
|
RAGConfig.MINERU_PARAMS = e.target.value;
|
||||||
<div class="flex w-full">
|
}
|
||||||
<input
|
}}
|
||||||
class="flex-1 w-full text-xs bg-transparent outline-hidden"
|
placeholder={`{\n "enable_ocr": false,\n "enable_formula": true,\n "enable_table": true,\n "language": "en",\n "model_version": "pipeline",\n "page_ranges": ""\n}`}
|
||||||
placeholder={$i18n.t('Page ranges (optional): e.g., 1-10,15,20-25')}
|
minSize={100}
|
||||||
bind:value={RAGConfig.MINERU_PAGE_RANGES}
|
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</details>
|
|
||||||
{/if}
|
{/if}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue