Merge pull request #15548 from expruc/fix/docling_ignore_html

fix: text/html files being detected as text when loaded with docling/tika
This commit is contained in:
Tim Jaeryang Baek 2025-07-08 13:16:01 +04:00 committed by GitHub
commit a748f19ac2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -226,7 +226,10 @@ class Loader:
def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
    """Decide whether a file should be handled as plain text.

    A file counts as text when its extension is listed in
    ``known_source_ext``, or when its content type contains ``text/`` and
    does not contain ``html``.

    Args:
        file_ext: File extension to look up in ``known_source_ext``.
        file_content_type: Content-type string for the file; may be empty
            or ``None``.

    Returns:
        ``True`` if the file is treated as text, ``False`` otherwise.
    """
    if file_ext in known_source_ext:
        return True
    # Wrap in bool() so a missing or empty content type yields False instead
    # of leaking None or "" through the `-> bool` signature.
    return bool(
        file_content_type
        and "text/" in file_content_type
        # Avoid text/html files being detected as text
        and "html" not in file_content_type
    )
def _get_loader(self, filename: str, file_content_type: str, file_path: str):