open-webui/backend/open_webui/retrieval/loaders/main.py

import requests
import logging
import ftfy
import sys

from langchain_community.document_loaders import (
    AzureAIDocumentIntelligenceLoader,
    BSHTMLLoader,
    CSVLoader,
    Docx2txtLoader,
    OutlookMessageLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredEPubLoader,
    UnstructuredExcelLoader,
    UnstructuredMarkdownLoader,
    UnstructuredPowerPointLoader,
    UnstructuredRSTLoader,
    UnstructuredXMLLoader,
    YoutubeLoader,
)
from langchain_core.documents import Document

from open_webui.retrieval.loaders.external_document import ExternalDocumentLoader

from open_webui.retrieval.loaders.mistral import MistralLoader
from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader


from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL

# Send log records to stdout at the globally configured level.
logging.basicConfig(stream=sys.stdout, level=GLOBAL_LOG_LEVEL)
# Module-level logger; its level comes from the "RAG" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])

# Lower-case file extensions (without the leading dot) that are recognised as
# plain-text source/config files. Each extension is listed exactly once.
known_source_ext = [
    "go",
    "py",
    "java",
    "sh",
    "bat",
    "ps1",
    "cmd",
    "js",
    "ts",
    "css",
    "cpp",
    "hpp",
    "h",
    "c",
    "cs",
    "sql",
    "log",
    "ini",
    "pl",
    "pm",
    "r",
    "dart",
    "dockerfile",
    "env",
    "php",
    "hs",
    "hsc",
    "lua",
    "nginxconf",
    "conf",
    "m",
    "mm",
    "plsql",
    "perl",
    "rb",
    "rs",
    "db2",
    "scala",
    "bash",
    "swift",
    "vue",
    "svelte",
    "ex",
    "exs",
    "erl",
    "tsx",
    "jsx",
    "lhs",
    "json",
]


class TikaLoader:
    """Extract a file's text by uploading its bytes to a Tika server.

    Args:
        url: Base URL of the Tika server; ``tika/text`` is appended to it.
        file_path: Path of the local file to upload.
        mime_type: Optional MIME type, sent as the request ``Content-Type``.
        extract_images: When equal to ``True``, the
            ``X-Tika-PDFextractInlineImages`` request header is set to
            ``"true"``.
    """

    def __init__(self, url, file_path, mime_type=None, extract_images=None):
        self.url = url
        self.file_path = file_path
        self.mime_type = mime_type

        self.extract_images = extract_images

    def load(self) -> list[Document]:
        """Send the file to Tika and wrap the extracted text in a ``Document``.

        Returns:
            A one-element list. The document's metadata holds only a
            ``Content-Type`` entry: the one reported by Tika when present,
            otherwise the ``mime_type`` given at construction (if any).

        Raises:
            Exception: If Tika responds with a non-success status.
        """
        with open(self.file_path, "rb") as f:
            data = f.read()

        headers = {}
        if self.mime_type is not None:
            headers["Content-Type"] = self.mime_type

        # Kept as an equality test so that only True (or 1) enables the
        # option; other truthy configuration values are ignored as before.
        if self.extract_images == True:
            headers["X-Tika-PDFextractInlineImages"] = "true"

        endpoint = self.url
        if not endpoint.endswith("/"):
            endpoint += "/"
        endpoint += "tika/text"

        r = requests.put(endpoint, data=data, headers=headers)

        if not r.ok:
            raise Exception(f"Error calling Tika: {r.reason}")

        raw_metadata = r.json()

        text = raw_metadata.get("X-TIKA:content")
        if text is None:
            # Covers both a missing key and an explicit JSON null.
            text = "<No text content found>"
        text = text.strip()

        # Build the metadata separately from the request headers so that
        # request-only headers (e.g. X-Tika-PDFextractInlineImages) do not
        # end up in the stored document metadata.
        metadata = {}
        if "Content-Type" in raw_metadata:
            metadata["Content-Type"] = raw_metadata["Content-Type"]
        elif self.mime_type is not None:
            metadata["Content-Type"] = self.mime_type

        log.debug("Tika extracted text: %s", text)

        return [Document(page_content=text, metadata=metadata)]


class DoclingLoader:
    """Convert a file to Markdown through a Docling server's HTTP API.

    Args:
        url: Base URL of the Docling server (a trailing ``/`` is removed).
        file_path: Path of the local file to upload.
        mime_type: Optional MIME type of the file; when absent the upload is
            labelled ``application/octet-stream``.
        params: Optional dict of conversion options. Recognised keys:
            ``do_picture_description``, ``picture_description_local``
            (JSON string or JSON-serialisable object), ``ocr_engine`` and
            ``ocr_lang`` (comma-separated string).
    """

    def __init__(self, url, file_path=None, mime_type=None, params=None):
        self.url = url.rstrip("/")
        self.file_path = file_path
        self.mime_type = mime_type

        self.params = params or {}

    def load(self) -> list[Document]:
        """Upload the file for conversion and return the Markdown result.

        Returns:
            A one-element list whose document contains the ``md_content`` of
            the response (or a placeholder when none is returned) and, when
            ``mime_type`` is set, a ``Content-Type`` metadata entry.

        Raises:
            Exception: If the server responds with a non-success status. The
                message includes the HTTP reason and, when available, the
                ``detail`` field or raw body of the error response.
        """
        # Local import: only needed to serialise a non-string option below.
        import json

        with open(self.file_path, "rb") as f:
            files = {
                "files": (
                    self.file_path,
                    f,
                    self.mime_type or "application/octet-stream",
                )
            }

            # Options that are always sent.
            params = {
                "image_export_mode": "placeholder",
                "table_mode": "accurate",
            }

            do_picture_description = self.params.get("do_picture_description")
            if do_picture_description:
                params["do_picture_description"] = do_picture_description

                # Forward the picture-description model settings supplied by
                # the caller; previously they were accepted but never sent.
                picture_description_local = self.params.get(
                    "picture_description_local"
                )
                if picture_description_local:
                    if not isinstance(picture_description_local, str):
                        picture_description_local = json.dumps(
                            picture_description_local
                        )
                    params["picture_description_local"] = picture_description_local

            ocr_engine = self.params.get("ocr_engine")
            ocr_lang = self.params.get("ocr_lang")
            # OCR settings are only sent when both engine and languages are set.
            if ocr_engine and ocr_lang:
                params["ocr_engine"] = ocr_engine
                params["ocr_lang"] = [
                    lang.strip() for lang in ocr_lang.split(",") if lang.strip()
                ]

            endpoint = f"{self.url}/v1alpha/convert/file"
            # The request must be sent while the file handle is still open.
            r = requests.post(endpoint, files=files, data=params)

        if not r.ok:
            error_msg = f"{r.reason}"
            if r.text:
                try:
                    error_data = r.json()
                except ValueError:
                    # Body is not JSON: report it verbatim.
                    error_msg += f" - {r.text}"
                else:
                    if isinstance(error_data, dict):
                        if "detail" in error_data:
                            error_msg += f" - {error_data['detail']}"
                    else:
                        error_msg += f" - {r.text}"
            raise Exception(f"Error calling Docling: {error_msg}")

        result = r.json()
        # `or {}` also covers an explicit JSON null for "document".
        document_data = result.get("document") or {}
        text = document_data.get("md_content")
        if text is None:
            text = "<No text content found>"

        metadata = {"Content-Type": self.mime_type} if self.mime_type else {}

        log.debug("Docling extracted text: %s", text)

        return [Document(page_content=text, metadata=metadata)]


class Loader:
    """Select and run the document loader that matches the configured engine.

    Args:
        engine: Name of the content-extraction engine. The values handled
            explicitly are ``"external"``, ``"tika"``, ``"datalab_marker"``,
            ``"docling"``, ``"document_intelligence"`` and ``"mistral_ocr"``;
            anything else uses the built-in loaders.
        **kwargs: Engine configuration looked up by key (for example
            ``TIKA_SERVER_URL`` or ``PDF_EXTRACT_IMAGES``).
    """

    # Extensions that are sent to the Datalab Marker API.
    _DATALAB_MARKER_EXTS = (
        "pdf",
        "xls",
        "xlsx",
        "ods",
        "doc",
        "docx",
        "odt",
        "ppt",
        "pptx",
        "odp",
        "html",
        "epub",
        "png",
        "jpeg",
        "jpg",
        "webp",
        "gif",
        "tiff",
    )

    # Extensions / MIME types that are sent to Azure Document Intelligence.
    _DOCUMENT_INTELLIGENCE_EXTS = ("pdf", "xls", "xlsx", "docx", "ppt", "pptx")
    _DOCUMENT_INTELLIGENCE_MIME_TYPES = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    def __init__(self, engine: str = "", **kwargs):
        self.engine = engine
        self.kwargs = kwargs

    def load(
        self, filename: str, file_content_type: str, file_path: str
    ) -> list[Document]:
        """Load a file and return its documents with text passed through ftfy.

        Args:
            filename: Original file name; its last dot-separated part is used
                as the extension.
            file_content_type: MIME type reported for the file.
            file_path: Path of the file on disk.

        Returns:
            New ``Document`` objects carrying the loader's metadata and the
            page content after ``ftfy.fix_text``.
        """
        loader = self._get_loader(filename, file_content_type, file_path)
        docs = loader.load()

        return [
            Document(
                page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata
            )
            for doc in docs
        ]

    def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
        """Return True for known source extensions or ``text/`` MIME types."""
        if file_ext in known_source_ext:
            return True
        # bool() keeps the declared return type even when the content type
        # is None or an empty string.
        return bool(file_content_type) and "text/" in file_content_type

    def _get_loader(self, filename: str, file_content_type: str, file_path: str):
        """Build the loader for this file according to engine and config.

        An engine branch is only taken when its required configuration values
        are set (non-empty); otherwise the built-in loaders are used.
        """
        # For a name without any dot this is the whole lower-cased name.
        file_ext = filename.split(".")[-1].lower()

        if (
            self.engine == "external"
            and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL")
            and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY")
        ):
            return ExternalDocumentLoader(
                file_path=file_path,
                url=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL"),
                api_key=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY"),
                mime_type=file_content_type,
            )

        if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
            # Plain-text files are read locally instead of being uploaded.
            if self._is_text_file(file_ext, file_content_type):
                return TextLoader(file_path, autodetect_encoding=True)
            return TikaLoader(
                url=self.kwargs.get("TIKA_SERVER_URL"),
                file_path=file_path,
                mime_type=file_content_type,
                extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
            )

        if (
            self.engine == "datalab_marker"
            and self.kwargs.get("DATALAB_MARKER_API_KEY")
            and file_ext in self._DATALAB_MARKER_EXTS
        ):
            return DatalabMarkerLoader(
                file_path=file_path,
                api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
                langs=self.kwargs.get("DATALAB_MARKER_LANGS"),
                use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
                skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
                force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),
                paginate=self.kwargs.get("DATALAB_MARKER_PAGINATE", False),
                strip_existing_ocr=self.kwargs.get(
                    "DATALAB_MARKER_STRIP_EXISTING_OCR", False
                ),
                disable_image_extraction=self.kwargs.get(
                    "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
                ),
                output_format=self.kwargs.get(
                    "DATALAB_MARKER_OUTPUT_FORMAT", "markdown"
                ),
            )

        if self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
            # Plain-text files are read locally instead of being uploaded.
            if self._is_text_file(file_ext, file_content_type):
                return TextLoader(file_path, autodetect_encoding=True)
            return DoclingLoader(
                url=self.kwargs.get("DOCLING_SERVER_URL"),
                file_path=file_path,
                mime_type=file_content_type,
                params={
                    "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"),
                    "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"),
                    "do_picture_description": self.kwargs.get(
                        "DOCLING_DO_PICTURE_DESCRIPTION"
                    ),
                    "picture_description_local": (
                        '{\n'
                        '    "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",\n'
                        '    "prompt": "Analyze the image and provide a comprehensive, detailed description. Identify all visible objects, their attributes, actions taking place, spatial relationships, and any contextual or inferred connections. Use clear, structured, and informative language suitable for downstream retrieval or knowledge extraction tasks."\n'
                        '}'
                    ),
                },
            )

        # Truthiness (rather than `!= ""`) so that a missing (None) endpoint
        # or key falls back to the built-in loaders, like the engines above.
        if (
            self.engine == "document_intelligence"
            and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT")
            and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY")
            and (
                file_ext in self._DOCUMENT_INTELLIGENCE_EXTS
                or file_content_type in self._DOCUMENT_INTELLIGENCE_MIME_TYPES
            )
        ):
            return AzureAIDocumentIntelligenceLoader(
                file_path=file_path,
                api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
                api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
            )

        if (
            self.engine == "mistral_ocr"
            and self.kwargs.get("MISTRAL_OCR_API_KEY")
            and file_ext
            in ["pdf"]  # Mistral OCR currently only supports PDF and images
        ):
            return MistralLoader(
                api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path
            )

        # No engine-specific loader applies (including an "external" engine
        # without URL/API key): use the built-in loaders.
        return self._get_default_loader(file_ext, file_content_type, file_path)

    def _get_default_loader(
        self, file_ext: str, file_content_type: str, file_path: str
    ):
        """Pick a built-in loader from the file extension and MIME type."""
        if file_ext == "pdf":
            return PyPDFLoader(
                file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
            )
        if file_ext == "csv":
            return CSVLoader(file_path, autodetect_encoding=True)
        if file_ext == "rst":
            return UnstructuredRSTLoader(file_path, mode="elements")
        if file_ext == "xml":
            return UnstructuredXMLLoader(file_path)
        if file_ext in ["htm", "html"]:
            return BSHTMLLoader(file_path, open_encoding="unicode_escape")
        if file_ext == "md":
            return TextLoader(file_path, autodetect_encoding=True)
        if file_content_type == "application/epub+zip":
            return UnstructuredEPubLoader(file_path)
        if (
            file_content_type
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            or file_ext == "docx"
        ):
            return Docx2txtLoader(file_path)
        if file_content_type in [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ] or file_ext in ["xls", "xlsx"]:
            return UnstructuredExcelLoader(file_path)
        if file_content_type in [
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ] or file_ext in ["ppt", "pptx"]:
            return UnstructuredPowerPointLoader(file_path)
        if file_ext == "msg":
            return OutlookMessageLoader(file_path)

        # Everything else, recognised as text or not, is read as plain text.
        return TextLoader(file_path, autodetect_encoding=True)
refac 2024-09-28 00:23:09 +00:00			`import requests`
			`import logging`
refac 2024-09-28 00:49:18 +00:00			`import ftfy`
fix 2024-12-16 07:41:17 +00:00			`import sys`
refac 2024-09-28 00:49:18 +00:00
refac 2024-09-28 00:23:09 +00:00			`from langchain_community.document_loaders import (`
feat: Implement Document Intelligence as Content Extraction Engine 2025-02-07 12:44:47 +00:00			`AzureAIDocumentIntelligenceLoader,`
refac 2024-09-28 00:23:09 +00:00			`BSHTMLLoader,`
			`CSVLoader,`
			`Docx2txtLoader,`
			`OutlookMessageLoader,`
			`PyPDFLoader,`
			`TextLoader,`
			`UnstructuredEPubLoader,`
			`UnstructuredExcelLoader,`
			`UnstructuredMarkdownLoader,`
			`UnstructuredPowerPointLoader,`
			`UnstructuredRSTLoader,`
			`UnstructuredXMLLoader,`
			`YoutubeLoader,`
			`)`
			`from langchain_core.documents import Document`
Add Mistral OCR integration and configuration support 2025-03-22 12:44:50 +00:00
feat: external document loader support 2025-05-14 18:28:40 +00:00			`from open_webui.retrieval.loaders.external_document import ExternalDocumentLoader`
refac: PLEASE FOLLOW EXISTING CONVENTION 2025-05-28 23:47:02 +00:00
refactor: update import path for MistralLoader 2025-04-02 11:56:10 +00:00			`from open_webui.retrieval.loaders.mistral import MistralLoader`
refac: PLEASE FOLLOW EXISTING CONVENTION 2025-05-28 23:47:02 +00:00			`from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader`

Add Mistral OCR integration and configuration support 2025-03-22 12:44:50 +00:00
fix 2024-12-16 07:41:17 +00:00			`from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL`
refac 2024-09-28 00:23:09 +00:00
Only log file contents in debug 2024-12-16 02:58:26 +00:00			`logging.basicConfig(stream=sys.stdout, level=GLOBAL_LOG_LEVEL)`
refac 2024-09-28 00:23:09 +00:00			`log = logging.getLogger(__name__)`
			`log.setLevel(SRC_LOG_LEVELS["RAG"])`

			`known_source_ext = [`
			`"go",`
			`"py",`
			`"java",`
			`"sh",`
			`"bat",`
			`"ps1",`
			`"cmd",`
			`"js",`
			`"ts",`
			`"css",`
			`"cpp",`
			`"hpp",`
			`"h",`
			`"c",`
			`"cs",`
			`"sql",`
			`"log",`
			`"ini",`
			`"pl",`
			`"pm",`
			`"r",`
			`"dart",`
			`"dockerfile",`
			`"env",`
			`"php",`
			`"hs",`
			`"hsc",`
			`"lua",`
			`"nginxconf",`
			`"conf",`
			`"m",`
			`"mm",`
			`"plsql",`
			`"perl",`
			`"rb",`
			`"rs",`
			`"db2",`
			`"scala",`
			`"bash",`
			`"swift",`
			`"vue",`
			`"svelte",`
			`"ex",`
			`"exs",`
			`"erl",`
			`"tsx",`
			`"jsx",`
			`"hs",`
			`"lhs",`
adding a comma 2025-02-26 14:27:03 +00:00			`"json",`
refac 2024-09-28 00:23:09 +00:00			`]`


			`class TikaLoader:`
fix: tikaloader extract images 2025-05-05 19:40:34 +00:00			`def __init__(self, url, file_path, mime_type=None, extract_images=None):`
refac 2024-09-28 00:23:09 +00:00			`self.url = url`
			`self.file_path = file_path`
			`self.mime_type = mime_type`

refac 2025-05-05 20:46:32 +00:00			`self.extract_images = extract_images`
fix: tikaloader extract images 2025-05-05 19:40:34 +00:00
refac 2024-09-28 00:23:09 +00:00			`def load(self) -> list[Document]:`
			`with open(self.file_path, "rb") as f:`
			`data = f.read()`

			`if self.mime_type is not None:`
			`headers = {"Content-Type": self.mime_type}`
			`else:`
			`headers = {}`

fix: tikaloader extract images 2025-05-05 19:40:34 +00:00			`if self.extract_images == True:`
feat(ocr): add support for Docling OCR engine and language configuration This commit adds support for configuring the OCR engine and language(s) for Docling. Configuration can be set via the environment variables `DOCLING_OCR_ENGINE` and `DOCLING_OCR_LANG`, or through the UI. Fixes #13133 2025-05-02 21:31:00 +00:00			`headers["X-Tika-PDFextractInlineImages"] = "true"`
fix: pass header to Tika if PDF_EXTRACT_IMAGES is true 2025-04-20 15:36:40 +00:00
refac 2024-09-28 00:23:09 +00:00			`endpoint = self.url`
			`if not endpoint.endswith("/"):`
			`endpoint += "/"`
			`endpoint += "tika/text"`

			`r = requests.put(endpoint, data=data, headers=headers)`

			`if r.ok:`
			`raw_metadata = r.json()`
Fix: Tika 3.1.0.0 sends a lot of blank lines which degrades the RAG results, strip them. 2025-03-25 17:53:14 +00:00			`text = raw_metadata.get("X-TIKA:content", "<No text content found>").strip()`
refac 2024-09-28 00:23:09 +00:00
			`if "Content-Type" in raw_metadata:`
			`headers["Content-Type"] = raw_metadata["Content-Type"]`

Only log file contents in debug 2024-12-16 02:58:26 +00:00			`log.debug("Tika extracted text: %s", text)`
refac 2024-09-28 00:23:09 +00:00
			`return [Document(page_content=text, metadata=headers)]`
			`else:`
			`raise Exception(f"Error calling Tika: {r.reason}")`


feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`class DoclingLoader:`
feat: docling do picture description support 2025-05-14 17:26:49 +00:00			`def __init__(self, url, file_path=None, mime_type=None, params=None):`
style: change style in DoclingLoader 2025-03-05 23:15:55 +00:00			`self.url = url.rstrip("/")`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`self.file_path = file_path`
			`self.mime_type = mime_type`
feat: docling do picture description support 2025-05-14 17:26:49 +00:00
			`self.params = params or {}`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00
			`def load(self) -> list[Document]:`
			`with open(self.file_path, "rb") as f:`
fix: fix DoclingLoader input params 2025-03-05 17:53:45 +00:00			`files = {`
			`"files": (`
			`self.file_path,`
			`f,`
			`self.mime_type or "application/octet-stream",`
			`)`
			`}`

feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`params = {`
fix: fix DoclingLoader input params 2025-03-05 17:53:45 +00:00			`"image_export_mode": "placeholder",`
fix description vs classification mismatch 2025-06-04 12:13:00 +00:00			`"table_mode": "accurate"`
fix: fix DoclingLoader input params 2025-03-05 17:53:45 +00:00			`}`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00
feat: docling do picture description support 2025-05-14 17:26:49 +00:00			`if self.params:`
fix description vs classification mismatch 2025-06-04 12:13:00 +00:00			`if self.params.get("do_picture_description"):`
			`params["do_picture_description"] = self.params.get(`
			`"do_picture_description"`
feat: docling do picture description support 2025-05-14 17:26:49 +00:00			`)`

			`if self.params.get("ocr_engine") and self.params.get("ocr_lang"):`
			`params["ocr_engine"] = self.params.get("ocr_engine")`
			`params["ocr_lang"] = [`
			`lang.strip()`
			`for lang in self.params.get("ocr_lang").split(",")`
			`if lang.strip()`
			`]`
feat(ocr): add support for Docling OCR engine and language configuration This commit adds support for configuring the OCR engine and language(s) for Docling. Configuration can be set via the environment variables `DOCLING_OCR_ENGINE` and `DOCLING_OCR_LANG`, or through the UI. Fixes #13133 2025-05-02 21:31:00 +00:00
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`endpoint = f"{self.url}/v1alpha/convert/file"`
style: change style in DoclingLoader 2025-03-05 23:15:55 +00:00			`r = requests.post(endpoint, files=files, data=params)`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00
style: change style in DoclingLoader 2025-03-05 23:15:55 +00:00			`if r.ok:`
			`result = r.json()`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`document_data = result.get("document", {})`
			`text = document_data.get("md_content", "<No text content found>")`

			`metadata = {"Content-Type": self.mime_type} if self.mime_type else {}`
fix: fix DoclingLoader input params 2025-03-05 17:53:45 +00:00
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`log.debug("Docling extracted text: %s", text)`

			`return [Document(page_content=text, metadata=metadata)]`
			`else:`
style: change style in DoclingLoader 2025-03-05 23:15:55 +00:00			`error_msg = f"Error calling Docling API: {r.reason}"`
			`if r.text:`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`try:`
style: change style in DoclingLoader 2025-03-05 23:15:55 +00:00			`error_data = r.json()`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`if "detail" in error_data:`
			`error_msg += f" - {error_data['detail']}"`
style: change style in DoclingLoader 2025-03-05 23:15:55 +00:00			`except Exception:`
			`error_msg += f" - {r.text}"`
feat: docling support for document preprocessing 2025-02-14 12:08:03 +00:00			`raise Exception(f"Error calling Docling: {error_msg}")`


refac 2024-09-28 00:23:09 +00:00			`class Loader:`
			`def __init__(self, engine: str = "", **kwargs):`
			`self.engine = engine`
			`self.kwargs = kwargs`

			`def load(`
			`self, filename: str, file_content_type: str, file_path: str`
			`) -> list[Document]:`
			`loader = self._get_loader(filename, file_content_type, file_path)`
refac 2024-09-28 00:49:18 +00:00			`docs = loader.load()`

			`return [`
			`Document(`
			`page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata`
			`)`
			`for doc in docs`
			`]`
refac 2024-09-28 00:23:09 +00:00
fix: fix for text file handling with docling 2025-04-05 16:44:08 +00:00			`def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:`
			`return file_ext in known_source_ext or (`
			`file_content_type and file_content_type.find("text/") >= 0`
			`)`

refac 2024-09-28 00:23:09 +00:00			`def _get_loader(self, filename: str, file_content_type: str, file_path: str):`
			`file_ext = filename.split(".")[-1].lower()`

feat: external document loader support 2025-05-14 18:28:40 +00:00			`if (`
			`self.engine == "external"`
			`and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL")`
			`and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY")`
			`):`
			`loader = ExternalDocumentLoader(`
			`file_path=file_path,`
			`url=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL"),`
			`api_key=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY"),`
			`mime_type=file_content_type,`
			`)`
minor bug fix for external document loader not working 2025-05-20 05:40:23 +00:00			`elif self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):`
fix: fix for text file handling with docling 2025-04-05 16:44:08 +00:00			`if self._is_text_file(file_ext, file_content_type):`
refac 2024-09-28 00:23:09 +00:00			`loader = TextLoader(file_path, autodetect_encoding=True)`
			`else:`
			`loader = TikaLoader(`
			`url=self.kwargs.get("TIKA_SERVER_URL"),`
			`file_path=file_path,`
			`mime_type=file_content_type,`
fix: tikaloader extract images 2025-05-05 19:40:34 +00:00			`extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),`
refac 2024-09-28 00:23:09 +00:00			`)`
feat: Marker api content extraction support 2025-05-27 04:44:07 +00:00			`elif (`
			`self.engine == "datalab_marker"`
			`and self.kwargs.get("DATALAB_MARKER_API_KEY")`
chore: format 2025-05-28 22:36:33 +00:00			`and file_ext`
			`in [`
			`"pdf",`
			`"xls",`
			`"xlsx",`
			`"ods",`
			`"doc",`
			`"docx",`
			`"odt",`
			`"ppt",`
			`"pptx",`
			`"odp",`
			`"html",`
			`"epub",`
			`"png",`
			`"jpeg",`
			`"jpg",`
			`"webp",`
			`"gif",`
			`"tiff",`
			`]`
feat: Marker api content extraction support 2025-05-27 04:44:07 +00:00			`):`
			`loader = DatalabMarkerLoader(`
			`file_path=file_path,`
			`api_key=self.kwargs["DATALAB_MARKER_API_KEY"],`
			`langs=self.kwargs.get("DATALAB_MARKER_LANGS"),`
			`use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),`
			`skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),`
			`force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),`
			`paginate=self.kwargs.get("DATALAB_MARKER_PAGINATE", False),`
chore: format 2025-05-28 22:36:33 +00:00			`strip_existing_ocr=self.kwargs.get(`
			`"DATALAB_MARKER_STRIP_EXISTING_OCR", False`
			`),`
			`disable_image_extraction=self.kwargs.get(`
			`"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False`
			`),`
			`output_format=self.kwargs.get(`
			`"DATALAB_MARKER_OUTPUT_FORMAT", "markdown"`
			`),`
feat: Marker api content extraction support 2025-05-27 04:44:07 +00:00			`)`
feat: merge with main 2025-03-05 22:04:34 +00:00			`elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):`
fix: fix for text file handling with docling 2025-04-05 16:44:08 +00:00			`if self._is_text_file(file_ext, file_content_type):`
			`loader = TextLoader(file_path, autodetect_encoding=True)`
			`else:`
			`loader = DoclingLoader(`
			`url=self.kwargs.get("DOCLING_SERVER_URL"),`
			`file_path=file_path,`
			`mime_type=file_content_type,`
feat: docling do picture description support 2025-05-14 17:26:49 +00:00			`params={`
			`"ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"),`
			`"ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"),`
fix description vs classification mismatch 2025-06-04 12:13:00 +00:00			`"do_picture_description": self.kwargs.get(`
feat: docling do picture description support 2025-05-14 17:26:49 +00:00			`"DOCLING_DO_PICTURE_DESCRIPTION"`
			`),`
feat(loader): add picture description configuration for DoclingLoader 2025-06-04 10:34:39 +00:00			`"picture_description_local": (`
			`'{\n'`
			`' "repo_id": "HuggingFaceTB/SmolVLM-256M-Instruct",\n'`
feat(loader): enhance picture description prompt for improved detail and clarity 2025-06-04 12:25:31 +00:00			`' "prompt": "Analyze the image and provide a comprehensive, detailed description. Identify all visible objects, their attributes, actions taking place, spatial relationships, and any contextual or inferred connections. Use clear, structured, and informative language suitable for downstream retrieval or knowledge extraction tasks."\n'`
feat(loader): add picture description configuration for DoclingLoader 2025-06-04 10:34:39 +00:00			`'}'`
			`)`
feat: docling do picture description support 2025-05-14 17:26:49 +00:00			`},`
fix: fix for text file handling with docling 2025-04-05 16:44:08 +00:00			`)`
feat: Implement Document Intelligence as Content Extraction Engine 2025-02-07 12:44:47 +00:00			`elif (`
			`self.engine == "document_intelligence"`
			`and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""`
			`and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != ""`
			`and (`
			`file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"]`
			`or file_content_type`
			`in [`
			`"application/vnd.ms-excel",`
			`"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",`
			`"application/vnd.openxmlformats-officedocument.wordprocessingml.document",`
			`"application/vnd.ms-powerpoint",`
			`"application/vnd.openxmlformats-officedocument.presentationml.presentation",`
			`]`
			`)`
			`):`
			`loader = AzureAIDocumentIntelligenceLoader(`
			`file_path=file_path,`
			`api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),`
			`api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),`
			`)`
Add Mistral OCR integration and configuration support 2025-03-22 12:44:50 +00:00			`elif (`
			`self.engine == "mistral_ocr"`
			`and self.kwargs.get("MISTRAL_OCR_API_KEY") != ""`
			`and file_ext`
			`in ["pdf"] # Mistral OCR currently only supports PDF and images`
			`):`
			`loader = MistralLoader(`
			`api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path`
feat: external document loader support 2025-05-14 18:28:40 +00:00			`)`
			`elif (`
			`self.engine == "external"`
			`and self.kwargs.get("MISTRAL_OCR_API_KEY") != ""`
			`and file_ext`
			`in ["pdf"] # Mistral OCR currently only supports PDF and images`
			`):`
			`loader = MistralLoader(`
			`api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path`
Add Mistral OCR integration and configuration support 2025-03-22 12:44:50 +00:00			`)`
refac 2024-09-28 00:23:09 +00:00			`else:`
			`if file_ext == "pdf":`
			`loader = PyPDFLoader(`
			`file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")`
			`)`
			`elif file_ext == "csv":`
Fix: CSV loader encoding issue using autodetect_encoding=True 2025-03-29 07:44:53 +00:00			`loader = CSVLoader(file_path, autodetect_encoding=True)`
refac 2024-09-28 00:23:09 +00:00			`elif file_ext == "rst":`
			`loader = UnstructuredRSTLoader(file_path, mode="elements")`
			`elif file_ext == "xml":`
			`loader = UnstructuredXMLLoader(file_path)`
			`elif file_ext in ["htm", "html"]:`
			`loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")`
			`elif file_ext == "md":`
refac: parse md files with TextLoader 2024-10-28 12:50:56 +00:00			`loader = TextLoader(file_path, autodetect_encoding=True)`
refac 2024-09-28 00:23:09 +00:00			`elif file_content_type == "application/epub+zip":`
			`loader = UnstructuredEPubLoader(file_path)`
			`elif (`
			`file_content_type`
			`== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"`
			`or file_ext == "docx"`
			`):`
			`loader = Docx2txtLoader(file_path)`
			`elif file_content_type in [`
			`"application/vnd.ms-excel",`
			`"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",`
			`] or file_ext in ["xls", "xlsx"]:`
			`loader = UnstructuredExcelLoader(file_path)`
			`elif file_content_type in [`
			`"application/vnd.ms-powerpoint",`
			`"application/vnd.openxmlformats-officedocument.presentationml.presentation",`
			`] or file_ext in ["ppt", "pptx"]:`
			`loader = UnstructuredPowerPointLoader(file_path)`
			`elif file_ext == "msg":`
			`loader = OutlookMessageLoader(file_path)`
fix: fix for text file handling with docling 2025-04-05 16:44:08 +00:00			`elif self._is_text_file(file_ext, file_content_type):`
refac 2024-09-28 00:23:09 +00:00			`loader = TextLoader(file_path, autodetect_encoding=True)`
			`else:`
			`loader = TextLoader(file_path, autodetect_encoding=True)`

			`return loader`