diff --git a/backend/open_webui/migrations/versions/3e0e00844bb0_add_knowledge_file_table.py b/backend/open_webui/migrations/versions/3e0e00844bb0_add_knowledge_file_table.py new file mode 100644 index 0000000000..02909ace9d --- /dev/null +++ b/backend/open_webui/migrations/versions/3e0e00844bb0_add_knowledge_file_table.py @@ -0,0 +1,160 @@ +"""Add knowledge_file table + +Revision ID: 3e0e00844bb0 +Revises: 90ef40d4714e +Create Date: 2025-12-02 06:54:19.401334 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import inspect +import open_webui.internal.db + +import time +import json +import uuid + +# revision identifiers, used by Alembic. +revision: str = "3e0e00844bb0" +down_revision: Union[str, None] = "90ef40d4714e" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "knowledge_file", + sa.Column("id", sa.Text(), primary_key=True), + sa.Column("user_id", sa.Text(), nullable=False), + sa.Column( + "knowledge_id", + sa.Text(), + sa.ForeignKey("knowledge.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column( + "file_id", + sa.Text(), + sa.ForeignKey("file.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("created_at", sa.BigInteger(), nullable=False), + sa.Column("updated_at", sa.BigInteger(), nullable=False), + # indexes + sa.Index("ix_knowledge_file_knowledge_id", "knowledge_id"), + sa.Index("ix_knowledge_file_file_id", "file_id"), + sa.Index("ix_knowledge_file_user_id", "user_id"), + # unique constraints + sa.UniqueConstraint( + "knowledge_id", "file_id", name="uq_knowledge_file_knowledge_file" + ), # prevent duplicate entries + ) + + connection = op.get_bind() + + # 2. Read existing group with user_ids JSON column + knowledge_table = sa.Table( + "knowledge", + sa.MetaData(), + sa.Column("id", sa.Text()), + sa.Column("user_id", sa.Text()), + sa.Column("data", sa.JSON()), # JSON stored as text in SQLite + PG + ) + + results = connection.execute( + sa.select( + knowledge_table.c.id, knowledge_table.c.user_id, knowledge_table.c.data + ) + ).fetchall() + + # 3. Insert members into group_member table + kf_table = sa.Table( + "knowledge_file", + sa.MetaData(), + sa.Column("id", sa.Text()), + sa.Column("user_id", sa.Text()), + sa.Column("knowledge_id", sa.Text()), + sa.Column("file_id", sa.Text()), + sa.Column("created_at", sa.BigInteger()), + sa.Column("updated_at", sa.BigInteger()), + ) + + now = int(time.time()) + for knowledge_id, user_id, data in results: + if not data: + continue + + if isinstance(data, str): + try: + data = json.loads(data) + except Exception: + continue # skip invalid JSON + + if not isinstance(data, dict): + continue + + file_ids = data.get("file_ids", []) + + rows = [ + { + "id": str(uuid.uuid4()), + "user_id": user_id, + "knowledge_id": knowledge_id, + "file_id": file_id, + "created_at": now, + "updated_at": now, + } + for file_id in file_ids + ] + + if rows: + connection.execute(kf_table.insert(), rows) + + with op.batch_alter_table("knowledge") as batch: + batch.drop_column("data") + + +def downgrade() -> None: + # 1. Add back the old data column + op.add_column("knowledge", sa.Column("data", sa.JSON(), nullable=True)) + + connection = op.get_bind() + + # 2. Read knowledge_file entries and reconstruct data JSON + knowledge_table = sa.Table( + "knowledge", + sa.MetaData(), + sa.Column("id", sa.Text()), + sa.Column("data", sa.JSON()), + ) + + kf_table = sa.Table( + "knowledge_file", + sa.MetaData(), + sa.Column("id", sa.Text()), + sa.Column("knowledge_id", sa.Text()), + sa.Column("file_id", sa.Text()), + ) + + results = connection.execute(sa.select(knowledge_table.c.id)).fetchall() + + for (knowledge_id,) in results: + file_ids = connection.execute( + sa.select(kf_table.c.file_id).where(kf_table.c.knowledge_id == knowledge_id) + ).fetchall() + + file_ids_list = [fid for (fid,) in file_ids] + + data_json = {"file_ids": file_ids_list} + + connection.execute( + knowledge_table.update() + .where(knowledge_table.c.id == knowledge_id) + .values(data=data_json) + ) + + # 3. Drop the knowledge_file table + op.drop_table("knowledge_file") diff --git a/backend/open_webui/models/knowledge.py b/backend/open_webui/models/knowledge.py index cfef77e237..76548c8da4 100644 --- a/backend/open_webui/models/knowledge.py +++ b/backend/open_webui/models/knowledge.py @@ -7,13 +7,21 @@ import uuid from open_webui.internal.db import Base, get_db from open_webui.env import SRC_LOG_LEVELS -from open_webui.models.files import FileMetadataResponse +from open_webui.models.files import File, FileModel, FileMetadataResponse from open_webui.models.groups import Groups from open_webui.models.users import Users, UserResponse from pydantic import BaseModel, ConfigDict -from sqlalchemy import BigInteger, Column, String, Text, JSON +from sqlalchemy import ( + BigInteger, + Column, + ForeignKey, + String, + Text, + JSON, + UniqueConstraint, +) from open_webui.utils.access_control import has_access @@ -34,9 +42,7 @@ class Knowledge(Base): name = Column(Text) description = Column(Text) - data = Column(JSON, nullable=True) meta = Column(JSON, nullable=True) - access_control = Column(JSON, nullable=True) # Controls data access levels. # Defines access control rules for this entry. # - `None`: Public access, available to all users with the "user" role. @@ -67,7 +73,6 @@ class KnowledgeModel(BaseModel): name: str description: str - data: Optional[dict] = None meta: Optional[dict] = None access_control: Optional[dict] = None @@ -76,11 +81,42 @@ class KnowledgeModel(BaseModel): updated_at: int # timestamp in epoch +class KnowledgeFile(Base): + __tablename__ = "knowledge_file" + + id = Column(Text, unique=True, primary_key=True) + + knowledge_id = Column( + Text, ForeignKey("knowledge.id", ondelete="CASCADE"), nullable=False + ) + file_id = Column(Text, ForeignKey("file.id", ondelete="CASCADE"), nullable=False) + user_id = Column(Text, nullable=False) + + created_at = Column(BigInteger, nullable=False) + updated_at = Column(BigInteger, nullable=False) + + __table_args__ = ( + UniqueConstraint( + "knowledge_id", "file_id", name="uq_knowledge_file_knowledge_file" + ), + ) + + +class KnowledgeFileModel(BaseModel): + id: str + knowledge_id: str + file_id: str + user_id: str + + created_at: int # timestamp in epoch + updated_at: int # timestamp in epoch + + model_config = ConfigDict(from_attributes=True) + + #################### # Forms #################### - - class KnowledgeUserModel(KnowledgeModel): user: Optional[UserResponse] = None @@ -96,7 +132,6 @@ class KnowledgeUserResponse(KnowledgeUserModel): class KnowledgeForm(BaseModel): name: str description: str - data: Optional[dict] = None access_control: Optional[dict] = None @@ -182,6 +217,85 @@ class KnowledgeTable: except Exception: return None + def get_files_by_id(self, knowledge_id: str) -> list[FileModel]: + try: + with get_db() as db: + files = ( + db.query(File) + .join(KnowledgeFile, File.id == KnowledgeFile.file_id) + .filter(KnowledgeFile.knowledge_id == knowledge_id) + .all() + ) + return [FileModel.model_validate(file) for file in files] + except Exception: + return [] + + def get_file_metadatas_by_id(self, knowledge_id: str) -> list[FileMetadataResponse]: + try: + with get_db() as db: + files = self.get_files_by_id(knowledge_id) + return [FileMetadataResponse(**file.model_dump()) for file in files] + except Exception: + return [] + + def add_file_to_knowledge_by_id( + self, knowledge_id: str, file_id: str, user_id: str + ) -> Optional[KnowledgeFileModel]: + with get_db() as db: + knowledge_file = KnowledgeFileModel( + **{ + "id": str(uuid.uuid4()), + "knowledge_id": knowledge_id, + "file_id": file_id, + "user_id": user_id, + "created_at": int(time.time()), + "updated_at": int(time.time()), + } + ) + + try: + result = KnowledgeFile(**knowledge_file.model_dump()) + db.add(result) + db.commit() + db.refresh(result) + if result: + return KnowledgeFileModel.model_validate(result) + else: + return None + except Exception: + return None + + def remove_file_from_knowledge_by_id(self, knowledge_id: str, file_id: str) -> bool: + try: + with get_db() as db: + db.query(KnowledgeFile).filter_by( + knowledge_id=knowledge_id, file_id=file_id + ).delete() + db.commit() + return True + except Exception: + return False + + def reset_knowledge_by_id(self, id: str) -> Optional[KnowledgeModel]: + try: + with get_db() as db: + # Delete all knowledge_file entries for this knowledge_id + db.query(KnowledgeFile).filter_by(knowledge_id=id).delete() + db.commit() + + # Update the knowledge entry's updated_at timestamp + db.query(Knowledge).filter_by(id=id).update( + { + "updated_at": int(time.time()), + } + ) + db.commit() + + return self.get_knowledge_by_id(id=id) + except Exception as e: + log.exception(e) + return None + def update_knowledge_by_id( self, id: str, form_data: KnowledgeForm, overwrite: bool = False ) -> Optional[KnowledgeModel]: diff --git a/backend/open_webui/retrieval/utils.py b/backend/open_webui/retrieval/utils.py index b041a00471..711b1a8b79 100644 --- a/backend/open_webui/retrieval/utils.py +++ b/backend/open_webui/retrieval/utils.py @@ -1088,23 +1088,19 @@ async def get_sources_from_items( or knowledge_base.user_id == user.id or has_access(user.id, "read", knowledge_base.access_control) ): - - file_ids = knowledge_base.data.get("file_ids", []) + files = Knowledges.get_files_by_id(knowledge_base.id) documents = [] metadatas = [] - for file_id in file_ids: - file_object = Files.get_file_by_id(file_id) - - if file_object: - documents.append(file_object.data.get("content", "")) - metadatas.append( - { - "file_id": file_id, - "name": file_object.filename, - "source": file_object.filename, - } - ) + for file in files: + documents.append(file.data.get("content", "")) + metadatas.append( + { + "file_id": file.id, + "name": file.filename, + "source": file.filename, + } + ) query_result = { "documents": [documents], diff --git a/backend/open_webui/routers/files.py b/backend/open_webui/routers/files.py index 54084941fe..1d27c6ab38 100644 --- a/backend/open_webui/routers/files.py +++ b/backend/open_webui/routers/files.py @@ -53,6 +53,7 @@ router = APIRouter() ############################ +# TODO: Optimize this function to use the knowledge_file table for faster lookups. def has_access_to_file( file_id: Optional[str], access_type: str, user=Depends(get_verified_user) ) -> bool: diff --git a/backend/open_webui/routers/knowledge.py b/backend/open_webui/routers/knowledge.py index 46baa0eaea..3bfc961ac3 100644 --- a/backend/open_webui/routers/knowledge.py +++ b/backend/open_webui/routers/knowledge.py @@ -42,97 +42,38 @@ router = APIRouter() @router.get("/", response_model=list[KnowledgeUserResponse]) async def get_knowledge(user=Depends(get_verified_user)): + # Return knowledge bases with read access knowledge_bases = [] - if user.role == "admin" and BYPASS_ADMIN_ACCESS_CONTROL: knowledge_bases = Knowledges.get_knowledge_bases() else: knowledge_bases = Knowledges.get_knowledge_bases_by_user_id(user.id, "read") - # Get files for each knowledge base - knowledge_with_files = [] - for knowledge_base in knowledge_bases: - files = [] - if knowledge_base.data: - files = Files.get_file_metadatas_by_ids( - knowledge_base.data.get("file_ids", []) - ) - - # Check if all files exist - if len(files) != len(knowledge_base.data.get("file_ids", [])): - missing_files = list( - set(knowledge_base.data.get("file_ids", [])) - - set([file.id for file in files]) - ) - if missing_files: - data = knowledge_base.data or {} - file_ids = data.get("file_ids", []) - - for missing_file in missing_files: - file_ids.remove(missing_file) - - data["file_ids"] = file_ids - Knowledges.update_knowledge_data_by_id( - id=knowledge_base.id, data=data - ) - - files = Files.get_file_metadatas_by_ids(file_ids) - - knowledge_with_files.append( - KnowledgeUserResponse( - **knowledge_base.model_dump(), - files=files, - ) + return [ + KnowledgeUserResponse( + **knowledge_base.model_dump(), + files=Knowledges.get_file_metadatas_by_id(knowledge_base.id), ) - - return knowledge_with_files + for knowledge_base in knowledge_bases + ] @router.get("/list", response_model=list[KnowledgeUserResponse]) async def get_knowledge_list(user=Depends(get_verified_user)): + # Return knowledge bases with write access knowledge_bases = [] - if user.role == "admin" and BYPASS_ADMIN_ACCESS_CONTROL: knowledge_bases = Knowledges.get_knowledge_bases() else: knowledge_bases = Knowledges.get_knowledge_bases_by_user_id(user.id, "write") - # Get files for each knowledge base - knowledge_with_files = [] - for knowledge_base in knowledge_bases: - files = [] - if knowledge_base.data: - files = Files.get_file_metadatas_by_ids( - knowledge_base.data.get("file_ids", []) - ) - - # Check if all files exist - if len(files) != len(knowledge_base.data.get("file_ids", [])): - missing_files = list( - set(knowledge_base.data.get("file_ids", [])) - - set([file.id for file in files]) - ) - if missing_files: - data = knowledge_base.data or {} - file_ids = data.get("file_ids", []) - - for missing_file in missing_files: - file_ids.remove(missing_file) - - data["file_ids"] = file_ids - Knowledges.update_knowledge_data_by_id( - id=knowledge_base.id, data=data - ) - - files = Files.get_file_metadatas_by_ids(file_ids) - - knowledge_with_files.append( - KnowledgeUserResponse( - **knowledge_base.model_dump(), - files=files, - ) + return [ + KnowledgeUserResponse( + **knowledge_base.model_dump(), + files=Knowledges.get_file_metadatas_by_id(knowledge_base.id), ) - return knowledge_with_files + for knowledge_base in knowledge_bases + ] ############################ @@ -192,26 +133,9 @@ async def reindex_knowledge_files(request: Request, user=Depends(get_verified_us log.info(f"Starting reindexing for {len(knowledge_bases)} knowledge bases") - deleted_knowledge_bases = [] - for knowledge_base in knowledge_bases: - # -- Robust error handling for missing or invalid data - if not knowledge_base.data or not isinstance(knowledge_base.data, dict): - log.warning( - f"Knowledge base {knowledge_base.id} has no data or invalid data ({knowledge_base.data!r}). Deleting." - ) - try: - Knowledges.delete_knowledge_by_id(id=knowledge_base.id) - deleted_knowledge_bases.append(knowledge_base.id) - except Exception as e: - log.error( - f"Failed to delete invalid knowledge base {knowledge_base.id}: {e}" - ) - continue - try: - file_ids = knowledge_base.data.get("file_ids", []) - files = Files.get_files_by_ids(file_ids) + files = Knowledges.get_files_by_id(knowledge_base.id) try: if VECTOR_DB_CLIENT.has_collection(collection_name=knowledge_base.id): VECTOR_DB_CLIENT.delete_collection( @@ -251,9 +175,7 @@ async def reindex_knowledge_files(request: Request, user=Depends(get_verified_us for failed in failed_files: log.warning(f"File ID: {failed['file_id']}, Error: {failed['error']}") - log.info( - f"Reindexing completed. Deleted {len(deleted_knowledge_bases)} invalid knowledge bases: {deleted_knowledge_bases}" - ) + log.info(f"Reindexing completed.") return True @@ -271,19 +193,15 @@ async def get_knowledge_by_id(id: str, user=Depends(get_verified_user)): knowledge = Knowledges.get_knowledge_by_id(id=id) if knowledge: - if ( user.role == "admin" or knowledge.user_id == user.id or has_access(user.id, "read", knowledge.access_control) ): - file_ids = knowledge.data.get("file_ids", []) if knowledge.data else [] - files = Files.get_file_metadatas_by_ids(file_ids) - return KnowledgeFilesResponse( **knowledge.model_dump(), - files=files, + files=Knowledges.get_file_metadatas_by_id(knowledge.id), ) else: raise HTTPException( @@ -335,12 +253,9 @@ async def update_knowledge_by_id( knowledge = Knowledges.update_knowledge_by_id(id=id, form_data=form_data) if knowledge: - file_ids = knowledge.data.get("file_ids", []) if knowledge.data else [] - files = Files.get_file_metadatas_by_ids(file_ids) - return KnowledgeFilesResponse( **knowledge.model_dump(), - files=files, + files=Knowledges.get_file_metadatas_by_id(knowledge.id), ) else: raise HTTPException( @@ -366,7 +281,6 @@ def add_file_to_knowledge_by_id( user=Depends(get_verified_user), ): knowledge = Knowledges.get_knowledge_by_id(id=id) - if not knowledge: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, @@ -395,6 +309,11 @@ def add_file_to_knowledge_by_id( detail=ERROR_MESSAGES.FILE_NOT_PROCESSED, ) + # Add file to knowledge base + Knowledges.add_file_to_knowledge_by_id( + knowledge_id=id, file_id=form_data.file_id, user_id=user.id + ) + # Add content to the vector database try: process_file( @@ -410,32 +329,10 @@ def add_file_to_knowledge_by_id( ) if knowledge: - data = knowledge.data or {} - file_ids = data.get("file_ids", []) - - if form_data.file_id not in file_ids: - file_ids.append(form_data.file_id) - data["file_ids"] = file_ids - - knowledge = Knowledges.update_knowledge_data_by_id(id=id, data=data) - - if knowledge: - files = Files.get_file_metadatas_by_ids(file_ids) - - return KnowledgeFilesResponse( - **knowledge.model_dump(), - files=files, - ) - else: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.DEFAULT("knowledge"), - ) - else: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.DEFAULT("file_id"), - ) + return KnowledgeFilesResponse( + **knowledge.model_dump(), + files=Knowledges.get_file_metadatas_by_id(knowledge.id), + ) else: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, @@ -494,14 +391,9 @@ def update_file_from_knowledge_by_id( ) if knowledge: - data = knowledge.data or {} - file_ids = data.get("file_ids", []) - - files = Files.get_file_metadatas_by_ids(file_ids) - return KnowledgeFilesResponse( **knowledge.model_dump(), - files=files, + files=Knowledges.get_file_metadatas_by_id(knowledge.id), ) else: raise HTTPException( @@ -546,6 +438,10 @@ def remove_file_from_knowledge_by_id( detail=ERROR_MESSAGES.NOT_FOUND, ) + Knowledges.remove_file_from_knowledge_by_id( + knowledge_id=id, file_id=form_data.file_id + ) + # Remove content from the vector database try: VECTOR_DB_CLIENT.delete( @@ -575,31 +471,10 @@ def remove_file_from_knowledge_by_id( Files.delete_file_by_id(form_data.file_id) if knowledge: - data = knowledge.data or {} - file_ids = data.get("file_ids", []) - - if form_data.file_id in file_ids: - file_ids.remove(form_data.file_id) - data["file_ids"] = file_ids - - knowledge = Knowledges.update_knowledge_data_by_id(id=id, data=data) - if knowledge: - files = Files.get_file_metadatas_by_ids(file_ids) - - return KnowledgeFilesResponse( - **knowledge.model_dump(), - files=files, - ) - else: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.DEFAULT("knowledge"), - ) - else: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.DEFAULT("file_id"), - ) + return KnowledgeFilesResponse( + **knowledge.model_dump(), + files=Knowledges.get_file_metadatas_by_id(knowledge.id), + ) else: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, @@ -700,8 +575,7 @@ async def reset_knowledge_by_id(id: str, user=Depends(get_verified_user)): log.debug(e) pass - knowledge = Knowledges.update_knowledge_data_by_id(id=id, data={"file_ids": []}) - + knowledge = Knowledges.reset_knowledge_by_id(id=id) return knowledge @@ -762,25 +636,19 @@ async def add_files_to_knowledge_batch( ) raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) - # Add successful files to knowledge base - data = knowledge.data or {} - existing_file_ids = data.get("file_ids", []) - # Only add files that were successfully processed successful_file_ids = [r.file_id for r in result.results if r.status == "completed"] for file_id in successful_file_ids: - if file_id not in existing_file_ids: - existing_file_ids.append(file_id) - - data["file_ids"] = existing_file_ids - knowledge = Knowledges.update_knowledge_data_by_id(id=id, data=data) + Knowledges.add_file_to_knowledge_by_id( + knowledge_id=id, file_id=file_id, user_id=user.id + ) # If there were any errors, include them in the response if result.errors: error_details = [f"{err.file_id}: {err.error}" for err in result.errors] return KnowledgeFilesResponse( **knowledge.model_dump(), - files=Files.get_file_metadatas_by_ids(existing_file_ids), + files=Knowledges.get_file_metadatas_by_id(knowledge.id), warnings={ "message": "Some files failed to process", "errors": error_details, @@ -789,5 +657,5 @@ async def add_files_to_knowledge_batch( return KnowledgeFilesResponse( **knowledge.model_dump(), - files=Files.get_file_metadatas_by_ids(existing_file_ids), + files=Knowledges.get_file_metadatas_by_id(knowledge.id), )