2024-09-28 00:23:09 +00:00
import requests
import logging
2024-09-28 00:49:18 +00:00
import ftfy
2024-12-16 07:41:17 +00:00
import sys
2024-09-28 00:49:18 +00:00
2024-09-28 00:23:09 +00:00
from langchain_community . document_loaders import (
2025-02-07 12:44:47 +00:00
AzureAIDocumentIntelligenceLoader ,
2024-09-28 00:23:09 +00:00
BSHTMLLoader ,
CSVLoader ,
Docx2txtLoader ,
OutlookMessageLoader ,
PyPDFLoader ,
TextLoader ,
UnstructuredEPubLoader ,
UnstructuredExcelLoader ,
UnstructuredMarkdownLoader ,
UnstructuredPowerPointLoader ,
UnstructuredRSTLoader ,
UnstructuredXMLLoader ,
YoutubeLoader ,
)
from langchain_core . documents import Document
2025-03-22 12:44:50 +00:00
2025-05-14 18:28:40 +00:00
from open_webui . retrieval . loaders . external_document import ExternalDocumentLoader
2025-05-28 23:47:02 +00:00
2025-04-02 11:56:10 +00:00
from open_webui . retrieval . loaders . mistral import MistralLoader
2025-05-28 23:47:02 +00:00
from open_webui . retrieval . loaders . datalab_marker import DatalabMarkerLoader
2025-03-22 12:44:50 +00:00
2024-12-16 07:41:17 +00:00
from open_webui . env import SRC_LOG_LEVELS , GLOBAL_LOG_LEVEL
2024-09-28 00:23:09 +00:00
2024-12-16 02:58:26 +00:00
# Route log records to stdout at the globally configured level.
logging.basicConfig(level=GLOBAL_LOG_LEVEL, stream=sys.stdout)

# Module-level logger; its verbosity comes from the "RAG" entry of
# SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
# File extensions (lower-case, without the leading dot) that are treated as
# plain-text source/config files and therefore read with a text loader.
# Kept as a list so existing code that relies on list semantics keeps working.
known_source_ext = [
    "go",
    "py",
    "java",
    "sh",
    "bat",
    "ps1",
    "cmd",
    "js",
    "ts",
    "css",
    "cpp",
    "hpp",
    "h",
    "c",
    "cs",
    "sql",
    "log",
    "ini",
    "pl",
    "pm",
    "r",
    "dart",
    "dockerfile",
    "env",
    "php",
    "hs",
    "hsc",
    "lua",
    "nginxconf",
    "conf",
    "m",
    "mm",
    "plsql",
    "perl",
    "rb",
    "rs",
    "db2",
    "scala",
    "bash",
    "swift",
    "vue",
    "svelte",
    "ex",
    "exs",
    "erl",
    "tsx",
    "jsx",
    "lhs",
    "json",
]
class TikaLoader:
    """Extract the text of a file by uploading it to a Tika server.

    The raw file bytes are PUT to ``<url>/tika/text`` and the JSON reply is
    converted into a single ``Document``.
    """

    def __init__(self, url, file_path, mime_type=None, extract_images=None):
        """Remember the connection settings; no I/O is performed here.

        Args:
            url: Base URL of the Tika server, with or without a trailing slash.
            file_path: Path of the file to upload.
            mime_type: Optional value sent as the request ``Content-Type``.
            extract_images: When truthy, the ``X-Tika-PDFextractInlineImages``
                request header is set to ``"true"``.
        """
        self.url = url
        self.file_path = file_path
        self.mime_type = mime_type
        self.extract_images = extract_images

    def load(self) -> list[Document]:
        """Upload the file and return the extracted text as one document.

        Returns:
            A one-element list. ``page_content`` is the stripped value of the
            ``X-TIKA:content`` key of the reply (or a placeholder string when
            that key is absent). ``metadata`` is the dict of request headers,
            with ``Content-Type`` overwritten by the reply's value if present.

        Raises:
            Exception: If the server answers with a non-success status.
        """
        with open(self.file_path, "rb") as f:
            data = f.read()

        headers = {}
        if self.mime_type is not None:
            headers["Content-Type"] = self.mime_type
        if self.extract_images:
            headers["X-Tika-PDFextractInlineImages"] = "true"

        # Make sure exactly one slash separates the base URL from the path.
        base = self.url if self.url.endswith("/") else f"{self.url}/"
        endpoint = f"{base}tika/text"

        r = requests.put(endpoint, data=data, headers=headers)
        if not r.ok:
            raise Exception(f"Error calling Tika: {r.reason}")

        raw_metadata = r.json()
        text = raw_metadata.get("X-TIKA:content", "<No text content found>").strip()

        # The request headers double as the document metadata; prefer the
        # content type reported in the reply over the one that was sent.
        if "Content-Type" in raw_metadata:
            headers["Content-Type"] = raw_metadata["Content-Type"]

        log.debug("Tika extracted text: %s", text)
        return [Document(page_content=text, metadata=headers)]
2025-02-14 12:08:03 +00:00
class DoclingLoader:
    """Convert a file to markdown text through a Docling HTTP server.

    The file is POSTed as multipart form data to
    ``<url>/v1alpha/convert/file`` and the ``md_content`` of the returned
    document becomes the page content of a single ``Document``.
    """

    def __init__(self, url, file_path=None, mime_type=None, params=None):
        """Remember the connection settings; no I/O is performed here.

        Args:
            url: Base URL of the Docling server; trailing slashes are removed.
            file_path: Path of the file to upload.
            mime_type: MIME type sent with the upload and copied into the
                resulting metadata; ``application/octet-stream`` is sent when
                it is not given.
            params: Optional conversion settings. Only the keys
                ``do_picture_description``, ``ocr_engine`` and ``ocr_lang``
                are read by ``load``; any other key is ignored.
        """
        self.url = url.rstrip("/")
        self.file_path = file_path
        self.mime_type = mime_type
        self.params = params or {}

    def load(self) -> list[Document]:
        """Upload the file and return the converted text as one document.

        Returns:
            A one-element list whose ``page_content`` is the ``md_content``
            field of the reply's ``document`` object (or a placeholder string
            when it is missing) and whose ``metadata`` holds ``Content-Type``
            when a MIME type was supplied.

        Raises:
            Exception: If the server answers with a non-success status. The
                message includes the HTTP reason and, when available, the
                ``detail`` field of the JSON error body or the raw body text.
        """
        params = {
            "image_export_mode": "placeholder",
            "table_mode": "accurate",
        }

        picture_description = self.params.get("do_picture_description")
        if picture_description:
            params["do_picture_description"] = picture_description

        # The OCR settings are only forwarded as a pair.
        ocr_engine = self.params.get("ocr_engine")
        ocr_lang = self.params.get("ocr_lang")
        if ocr_engine and ocr_lang:
            # Accept either a comma-separated string or an iterable of codes.
            if isinstance(ocr_lang, str):
                ocr_lang = ocr_lang.split(",")
            params["ocr_engine"] = ocr_engine
            params["ocr_lang"] = [lang.strip() for lang in ocr_lang if lang.strip()]

        endpoint = f"{self.url}/v1alpha/convert/file"

        # The file handle must stay open while the multipart body is streamed.
        with open(self.file_path, "rb") as f:
            files = {
                "files": (
                    self.file_path,
                    f,
                    self.mime_type or "application/octet-stream",
                )
            }
            r = requests.post(endpoint, files=files, data=params)

        if not r.ok:
            error_msg = f"Error calling Docling: {r.reason}"
            if r.text:
                # Best effort: decorate the message with whatever detail the
                # server provided, falling back to the raw body when the body
                # is not the expected JSON object.
                try:
                    error_data = r.json()
                    if "detail" in error_data:
                        error_msg += f" - {error_data['detail']}"
                except Exception:
                    error_msg += f" - {r.text}"
            raise Exception(error_msg)

        result = r.json()
        document_data = result.get("document", {})
        text = document_data.get("md_content", "<No text content found>")
        metadata = {"Content-Type": self.mime_type} if self.mime_type else {}

        log.debug("Docling extracted text: %s", text)
        return [Document(page_content=text, metadata=metadata)]
2024-09-28 00:23:09 +00:00
class Loader:
    """Choose a document loader for a file and return its cleaned documents.

    ``engine`` names the preferred extraction backend and ``kwargs`` carries
    the settings for the backends (server URLs, API keys, flags). When the
    selected engine is not fully configured, or does not apply to the file, a
    loader is picked from the file extension / content type instead.
    """

    # Extensions routed to DatalabMarkerLoader when that engine is selected.
    _DATALAB_MARKER_EXTS = (
        "pdf",
        "xls",
        "xlsx",
        "ods",
        "doc",
        "docx",
        "odt",
        "ppt",
        "pptx",
        "odp",
        "html",
        "epub",
        "png",
        "jpeg",
        "jpg",
        "webp",
        "gif",
        "tiff",
    )

    # Extensions / content types routed to AzureAIDocumentIntelligenceLoader.
    _DOCUMENT_INTELLIGENCE_EXTS = ("pdf", "xls", "xlsx", "docx", "ppt", "pptx")
    _DOCUMENT_INTELLIGENCE_MIME_TYPES = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    # Content types recognised by the extension-based fallback.
    _WORD_MIME_TYPE = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    _EXCEL_MIME_TYPES = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    _POWERPOINT_MIME_TYPES = (
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    def __init__(self, engine: str = "", **kwargs):
        """Store the engine name and the backend settings.

        Args:
            engine: Name of the preferred extraction engine; the values
                compared against are ``"external"``, ``"tika"``,
                ``"datalab_marker"``, ``"docling"``,
                ``"document_intelligence"`` and ``"mistral_ocr"``.
            **kwargs: Backend settings, looked up by upper-case key.
        """
        self.engine = engine
        self.kwargs = kwargs

    def load(
        self, filename: str, file_content_type: str, file_path: str
    ) -> list[Document]:
        """Load a file and return its documents with repaired text.

        Args:
            filename: Name of the file; its last dot-separated part is used
                as the extension.
            file_content_type: MIME type of the file.
            file_path: Path of the file on disk.

        Returns:
            New ``Document`` objects whose ``page_content`` has been passed
            through ``ftfy.fix_text``; the metadata is carried over unchanged.
        """
        loader = self._get_loader(filename, file_content_type, file_path)
        docs = loader.load()
        return [
            Document(
                page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata
            )
            for doc in docs
        ]

    def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
        """Return True for known source extensions or ``text/`` content types."""
        if file_ext in known_source_ext:
            return True
        return bool(file_content_type and "text/" in file_content_type)

    def _get_loader(self, filename: str, file_content_type: str, file_path: str):
        """Return the loader instance to use for the given file.

        The configured engine is used only when its required settings are
        present (non-empty) and, where the engine is format-specific, the
        file matches; otherwise the extension-based fallback applies.
        """
        file_ext = filename.split(".")[-1].lower()
        engine = self.engine
        kwargs = self.kwargs

        if (
            engine == "external"
            and kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL")
            and kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY")
        ):
            return ExternalDocumentLoader(
                file_path=file_path,
                url=kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL"),
                api_key=kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY"),
                mime_type=file_content_type,
            )

        if engine == "tika" and kwargs.get("TIKA_SERVER_URL"):
            # Plain-text files are read locally instead of being uploaded.
            if self._is_text_file(file_ext, file_content_type):
                return TextLoader(file_path, autodetect_encoding=True)
            return TikaLoader(
                url=kwargs.get("TIKA_SERVER_URL"),
                file_path=file_path,
                mime_type=file_content_type,
                extract_images=kwargs.get("PDF_EXTRACT_IMAGES"),
            )

        if (
            engine == "datalab_marker"
            and kwargs.get("DATALAB_MARKER_API_KEY")
            and file_ext in self._DATALAB_MARKER_EXTS
        ):
            return DatalabMarkerLoader(
                file_path=file_path,
                api_key=kwargs["DATALAB_MARKER_API_KEY"],
                langs=kwargs.get("DATALAB_MARKER_LANGS"),
                use_llm=kwargs.get("DATALAB_MARKER_USE_LLM", False),
                skip_cache=kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
                force_ocr=kwargs.get("DATALAB_MARKER_FORCE_OCR", False),
                paginate=kwargs.get("DATALAB_MARKER_PAGINATE", False),
                strip_existing_ocr=kwargs.get(
                    "DATALAB_MARKER_STRIP_EXISTING_OCR", False
                ),
                disable_image_extraction=kwargs.get(
                    "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
                ),
                output_format=kwargs.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"),
            )

        if engine == "docling" and kwargs.get("DOCLING_SERVER_URL"):
            # Plain-text files are read locally instead of being uploaded.
            if self._is_text_file(file_ext, file_content_type):
                return TextLoader(file_path, autodetect_encoding=True)
            return DoclingLoader(
                url=kwargs.get("DOCLING_SERVER_URL"),
                file_path=file_path,
                mime_type=file_content_type,
                params={
                    "ocr_engine": kwargs.get("DOCLING_OCR_ENGINE"),
                    "ocr_lang": kwargs.get("DOCLING_OCR_LANG"),
                    "do_picture_description": kwargs.get(
                        "DOCLING_DO_PICTURE_DESCRIPTION"
                    ),
                    # NOTE(review): DoclingLoader.load in this file reads only
                    # do_picture_description / ocr_engine / ocr_lang, so this
                    # entry appears to have no effect - confirm whether it
                    # should be forwarded to the server.
                    "picture_description_local": (
                        "{\n"
                        '  "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",\n'
                        '  "prompt": "Analyze the image and provide a comprehensive, detailed description. Identify all visible objects, their attributes, actions taking place, spatial relationships, and any contextual or inferred connections. Use clear, structured, and informative language suitable for downstream retrieval or knowledge extraction tasks."\n'
                        "}"
                    ),
                },
            )

        # Truthiness (rather than `!= ""`) so that a missing setting, which
        # .get() reports as None, does not select the engine.
        if (
            engine == "document_intelligence"
            and kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT")
            and kwargs.get("DOCUMENT_INTELLIGENCE_KEY")
            and (
                file_ext in self._DOCUMENT_INTELLIGENCE_EXTS
                or file_content_type in self._DOCUMENT_INTELLIGENCE_MIME_TYPES
            )
        ):
            return AzureAIDocumentIntelligenceLoader(
                file_path=file_path,
                api_endpoint=kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
                api_key=kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
            )

        if (
            engine == "mistral_ocr"
            and kwargs.get("MISTRAL_OCR_API_KEY")
            # Mistral OCR currently only supports PDF and images
            and file_ext == "pdf"
        ):
            return MistralLoader(
                api_key=kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path
            )

        return self._get_default_loader(file_ext, file_content_type, file_path)

    def _get_default_loader(
        self, file_ext: str, file_content_type: str, file_path: str
    ):
        """Pick a loader from the extension / content type alone."""
        if file_ext == "pdf":
            return PyPDFLoader(
                file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
            )
        if file_ext == "csv":
            return CSVLoader(file_path, autodetect_encoding=True)
        if file_ext == "rst":
            return UnstructuredRSTLoader(file_path, mode="elements")
        if file_ext == "xml":
            return UnstructuredXMLLoader(file_path)
        if file_ext in ("htm", "html"):
            return BSHTMLLoader(file_path, open_encoding="unicode_escape")
        if file_ext == "md":
            return TextLoader(file_path, autodetect_encoding=True)
        if file_content_type == "application/epub+zip":
            return UnstructuredEPubLoader(file_path)
        if file_content_type == self._WORD_MIME_TYPE or file_ext == "docx":
            return Docx2txtLoader(file_path)
        if file_content_type in self._EXCEL_MIME_TYPES or file_ext in ("xls", "xlsx"):
            return UnstructuredExcelLoader(file_path)
        if file_content_type in self._POWERPOINT_MIME_TYPES or file_ext in (
            "ppt",
            "pptx",
        ):
            return UnstructuredPowerPointLoader(file_path)
        if file_ext == "msg":
            return OutlookMessageLoader(file_path)
        # Everything else, recognised text file or not, is read as text.
        return TextLoader(file_path, autodetect_encoding=True)