diff --git a/backend/open_webui/models/files.py b/backend/open_webui/models/files.py
index 57978225d4..bf07b5f86f 100644
--- a/backend/open_webui/models/files.py
+++ b/backend/open_webui/models/files.py
@@ -130,6 +130,17 @@ class FilesTable:
except Exception:
return None
+ def get_file_by_id_and_user_id(self, id: str, user_id: str) -> Optional[FileModel]:
+ with get_db() as db:
+ try:
+ file = db.query(File).filter_by(id=id, user_id=user_id).first()
+ if file:
+ return FileModel.model_validate(file)
+ else:
+ return None
+ except Exception:
+ return None
+
def get_file_metadata_by_id(self, id: str) -> Optional[FileMetadataResponse]:
with get_db() as db:
try:
diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index 0ddf824efa..7181981604 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -1407,59 +1407,35 @@ def process_file(
form_data: ProcessFileForm,
user=Depends(get_verified_user),
):
- try:
+ if user.role == "admin":
file = Files.get_file_by_id(form_data.file_id)
+ else:
+ file = Files.get_file_by_id_and_user_id(form_data.file_id, user.id)
- collection_name = form_data.collection_name
+ if file:
+ try:
- if collection_name is None:
- collection_name = f"file-{file.id}"
+ collection_name = form_data.collection_name
- if form_data.content:
- # Update the content in the file
- # Usage: /files/{file_id}/data/content/update, /files/ (audio file upload pipeline)
+ if collection_name is None:
+ collection_name = f"file-{file.id}"
- try:
- # /files/{file_id}/data/content/update
- VECTOR_DB_CLIENT.delete_collection(collection_name=f"file-{file.id}")
- except:
- # Audio file upload pipeline
- pass
+ if form_data.content:
+ # Update the content in the file
+ # Usage: /files/{file_id}/data/content/update, /files/ (audio file upload pipeline)
- docs = [
- Document(
- page_content=form_data.content.replace("
", "\n"),
- metadata={
- **file.meta,
- "name": file.filename,
- "created_by": file.user_id,
- "file_id": file.id,
- "source": file.filename,
- },
- )
- ]
-
- text_content = form_data.content
- elif form_data.collection_name:
- # Check if the file has already been processed and save the content
- # Usage: /knowledge/{id}/file/add, /knowledge/{id}/file/update
-
- result = VECTOR_DB_CLIENT.query(
- collection_name=f"file-{file.id}", filter={"file_id": file.id}
- )
-
- if result is not None and len(result.ids[0]) > 0:
- docs = [
- Document(
- page_content=result.documents[0][idx],
- metadata=result.metadatas[0][idx],
+ try:
+ # /files/{file_id}/data/content/update
+ VECTOR_DB_CLIENT.delete_collection(
+ collection_name=f"file-{file.id}"
)
- for idx, id in enumerate(result.ids[0])
- ]
- else:
+ except:
+ # Audio file upload pipeline
+ pass
+
docs = [
Document(
- page_content=file.data.get("content", ""),
+ page_content=form_data.content.replace("
", "\n"),
metadata={
**file.meta,
"name": file.filename,
@@ -1470,149 +1446,190 @@ def process_file(
)
]
- text_content = file.data.get("content", "")
- else:
- # Process the file and save the content
- # Usage: /files/
- file_path = file.path
- if file_path:
- file_path = Storage.get_file(file_path)
- loader = Loader(
- engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
- DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
- DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
- DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
- DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
- DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
- DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
- DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
- DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
- DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
- DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
- DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
- EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
- EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
- TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
- DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
- DOCLING_PARAMS={
- "do_ocr": request.app.state.config.DOCLING_DO_OCR,
- "force_ocr": request.app.state.config.DOCLING_FORCE_OCR,
- "ocr_engine": request.app.state.config.DOCLING_OCR_ENGINE,
- "ocr_lang": request.app.state.config.DOCLING_OCR_LANG,
- "pdf_backend": request.app.state.config.DOCLING_PDF_BACKEND,
- "table_mode": request.app.state.config.DOCLING_TABLE_MODE,
- "pipeline": request.app.state.config.DOCLING_PIPELINE,
- "do_picture_description": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
- "picture_description_mode": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
- "picture_description_local": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
- "picture_description_api": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
- },
- PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
- DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
- DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
- MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY,
- )
- docs = loader.load(
- file.filename, file.meta.get("content_type"), file_path
+ text_content = form_data.content
+ elif form_data.collection_name:
+ # Check if the file has already been processed and save the content
+ # Usage: /knowledge/{id}/file/add, /knowledge/{id}/file/update
+
+ result = VECTOR_DB_CLIENT.query(
+ collection_name=f"file-{file.id}", filter={"file_id": file.id}
)
- docs = [
- Document(
- page_content=doc.page_content,
- metadata={
- **doc.metadata,
- "name": file.filename,
- "created_by": file.user_id,
- "file_id": file.id,
- "source": file.filename,
- },
- )
- for doc in docs
- ]
- else:
- docs = [
- Document(
- page_content=file.data.get("content", ""),
- metadata={
- **file.meta,
- "name": file.filename,
- "created_by": file.user_id,
- "file_id": file.id,
- "source": file.filename,
- },
- )
- ]
- text_content = " ".join([doc.page_content for doc in docs])
-
- log.debug(f"text_content: {text_content}")
- Files.update_file_data_by_id(
- file.id,
- {"content": text_content},
- )
- hash = calculate_sha256_string(text_content)
- Files.update_file_hash_by_id(file.id, hash)
-
- if request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
- Files.update_file_data_by_id(file.id, {"status": "completed"})
- return {
- "status": True,
- "collection_name": None,
- "filename": file.filename,
- "content": text_content,
- }
- else:
- try:
- result = save_docs_to_vector_db(
- request,
- docs=docs,
- collection_name=collection_name,
- metadata={
- "file_id": file.id,
- "name": file.filename,
- "hash": hash,
- },
- add=(True if form_data.collection_name else False),
- user=user,
- )
- log.info(f"added {len(docs)} items to collection {collection_name}")
-
- if result:
- Files.update_file_metadata_by_id(
- file.id,
- {
- "collection_name": collection_name,
- },
- )
-
- Files.update_file_data_by_id(
- file.id,
- {"status": "completed"},
- )
-
- return {
- "status": True,
- "collection_name": collection_name,
- "filename": file.filename,
- "content": text_content,
- }
+ if result is not None and len(result.ids[0]) > 0:
+ docs = [
+ Document(
+ page_content=result.documents[0][idx],
+ metadata=result.metadatas[0][idx],
+ )
+ for idx, id in enumerate(result.ids[0])
+ ]
else:
- raise Exception("Error saving document to vector database")
- except Exception as e:
- raise e
+ docs = [
+ Document(
+ page_content=file.data.get("content", ""),
+ metadata={
+ **file.meta,
+ "name": file.filename,
+ "created_by": file.user_id,
+ "file_id": file.id,
+ "source": file.filename,
+ },
+ )
+ ]
- except Exception as e:
- log.exception(e)
- if "No pandoc was found" in str(e):
- raise HTTPException(
- status_code=status.HTTP_400_BAD_REQUEST,
- detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
+ text_content = file.data.get("content", "")
+ else:
+ # Process the file and save the content
+ # Usage: /files/
+ file_path = file.path
+ if file_path:
+ file_path = Storage.get_file(file_path)
+ loader = Loader(
+ engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
+ DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
+ DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
+ DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
+ DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
+ DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
+ DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
+ DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
+ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
+ DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
+ DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
+ DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
+ EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
+ EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
+ TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
+ DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
+ DOCLING_PARAMS={
+ "do_ocr": request.app.state.config.DOCLING_DO_OCR,
+ "force_ocr": request.app.state.config.DOCLING_FORCE_OCR,
+ "ocr_engine": request.app.state.config.DOCLING_OCR_ENGINE,
+ "ocr_lang": request.app.state.config.DOCLING_OCR_LANG,
+ "pdf_backend": request.app.state.config.DOCLING_PDF_BACKEND,
+ "table_mode": request.app.state.config.DOCLING_TABLE_MODE,
+ "pipeline": request.app.state.config.DOCLING_PIPELINE,
+ "do_picture_description": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
+ "picture_description_mode": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
+ "picture_description_local": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
+ "picture_description_api": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
+ },
+ PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
+ DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
+ DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
+ MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY,
+ )
+ docs = loader.load(
+ file.filename, file.meta.get("content_type"), file_path
+ )
+
+ docs = [
+ Document(
+ page_content=doc.page_content,
+ metadata={
+ **doc.metadata,
+ "name": file.filename,
+ "created_by": file.user_id,
+ "file_id": file.id,
+ "source": file.filename,
+ },
+ )
+ for doc in docs
+ ]
+ else:
+ docs = [
+ Document(
+ page_content=file.data.get("content", ""),
+ metadata={
+ **file.meta,
+ "name": file.filename,
+ "created_by": file.user_id,
+ "file_id": file.id,
+ "source": file.filename,
+ },
+ )
+ ]
+ text_content = " ".join([doc.page_content for doc in docs])
+
+ log.debug(f"text_content: {text_content}")
+ Files.update_file_data_by_id(
+ file.id,
+ {"content": text_content},
)
- else:
- raise HTTPException(
- status_code=status.HTTP_400_BAD_REQUEST,
- detail=str(e),
+ hash = calculate_sha256_string(text_content)
+ Files.update_file_hash_by_id(file.id, hash)
+
+ if request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
+ Files.update_file_data_by_id(file.id, {"status": "completed"})
+ return {
+ "status": True,
+ "collection_name": None,
+ "filename": file.filename,
+ "content": text_content,
+ }
+ else:
+ try:
+ result = save_docs_to_vector_db(
+ request,
+ docs=docs,
+ collection_name=collection_name,
+ metadata={
+ "file_id": file.id,
+ "name": file.filename,
+ "hash": hash,
+ },
+ add=(True if form_data.collection_name else False),
+ user=user,
+ )
+ log.info(f"added {len(docs)} items to collection {collection_name}")
+
+ if result:
+ Files.update_file_metadata_by_id(
+ file.id,
+ {
+ "collection_name": collection_name,
+ },
+ )
+
+ Files.update_file_data_by_id(
+ file.id,
+ {"status": "completed"},
+ )
+
+ return {
+ "status": True,
+ "collection_name": collection_name,
+ "filename": file.filename,
+ "content": text_content,
+ }
+ else:
+ raise Exception("Error saving document to vector database")
+ except Exception as e:
+ raise e
+
+ except Exception as e:
+ log.exception(e)
+ Files.update_file_data_by_id(
+ file.id,
+ {"status": "failed"},
)
+ if "No pandoc was found" in str(e):
+ raise HTTPException(
+ status_code=status.HTTP_400_BAD_REQUEST,
+ detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
+ )
+ else:
+ raise HTTPException(
+ status_code=status.HTTP_400_BAD_REQUEST,
+ detail=str(e),
+ )
+
+ else:
+ raise HTTPException(
+ status_code=status.HTTP_404_NOT_FOUND, detail=ERROR_MESSAGES.NOT_FOUND
+ )
+
class ProcessTextForm(BaseModel):
name: str