Refactor code structure for improved readability and maintainability

This commit is contained in:
Your Name
2026-03-29 20:17:52 +02:00
parent 03c451abe5
commit f82a77febe
4 changed files with 502 additions and 312 deletions

View File

@@ -1,11 +1,6 @@
"""File validation utilities — multi-layer security checks."""
import os
try:
import magic
HAS_MAGIC = True
except (ImportError, OSError):
HAS_MAGIC = False
import os
from flask import current_app
from werkzeug.utils import secure_filename
@@ -45,30 +40,60 @@ def validate_file(
if not file_storage or file_storage.filename == "":
raise FileValidationError("No file provided.")
filename = secure_filename(file_storage.filename)
if not filename:
raise FileValidationError("Invalid filename.")
raw_filename = str(file_storage.filename).strip()
if not raw_filename:
raise FileValidationError("No file provided.")
# Layer 2: Check file extension against whitelist
ext = _get_extension(filename)
filename = secure_filename(raw_filename)
allowed_extensions = config.get("ALLOWED_EXTENSIONS", {})
if allowed_types:
valid_extensions = {k: v for k, v in allowed_extensions.items() if k in allowed_types}
valid_extensions = {
k: v for k, v in allowed_extensions.items() if k in allowed_types
}
else:
valid_extensions = allowed_extensions
# Layer 2: Reject clearly invalid extensions before touching file streams.
ext = _get_extension(raw_filename) or _get_extension(filename)
if ext and ext not in valid_extensions:
raise FileValidationError(
f"File type '.{ext}' is not allowed. "
f"Allowed types: {', '.join(valid_extensions.keys())}"
)
# Layer 3: Check basic file size and header first so we can recover
# from malformed filenames like ".pdf" or "." using content sniffing.
file_storage.seek(0, os.SEEK_END)
file_size = file_storage.tell()
file_storage.seek(0)
if file_size == 0:
raise FileValidationError("File is empty.")
file_header = file_storage.read(8192)
file_storage.seek(0)
detected_mime = _detect_mime(file_header)
if not ext:
ext = _infer_extension_from_content(
file_header, detected_mime, valid_extensions
)
if raw_filename.startswith(".") and not _get_extension(filename):
filename = ""
if not filename:
filename = f"upload.{ext}" if ext else "upload"
if ext not in valid_extensions:
raise FileValidationError(
f"File type '.{ext}' is not allowed. "
f"Allowed types: {', '.join(valid_extensions.keys())}"
)
# Layer 3: Check file size against type-specific limits
file_storage.seek(0, os.SEEK_END)
file_size = file_storage.tell()
file_storage.seek(0)
# Layer 4: Check file size against type-specific limits
size_limits = size_limit_overrides or config.get("FILE_SIZE_LIMITS", {})
max_size = size_limits.get(ext, 20 * 1024 * 1024) # Default 20MB
@@ -78,15 +103,8 @@ def validate_file(
f"File too large. Maximum size for .{ext} files is {max_mb:.0f}MB."
)
if file_size == 0:
raise FileValidationError("File is empty.")
# Layer 4: Check MIME type using magic bytes (if libmagic is available)
file_header = file_storage.read(8192)
file_storage.seek(0)
if HAS_MAGIC:
detected_mime = magic.from_buffer(file_header, mime=True)
# Layer 5: Check MIME type using magic bytes (if libmagic is available)
if detected_mime:
expected_mimes = valid_extensions.get(ext, [])
if detected_mime not in expected_mimes:
@@ -95,7 +113,7 @@ def validate_file(
f"Detected type: {detected_mime}"
)
# Layer 5: Additional content checks for specific types
# Layer 6: Additional content checks for specific types
if ext == "pdf":
_check_pdf_safety(file_header)
@@ -104,9 +122,52 @@ def validate_file(
def _get_extension(filename: str) -> str:
    """Extract and normalize the extension of *filename*.

    The value is coerced to ``str`` and stripped first, so ``None`` and
    whitespace-only names are treated as having no extension.

    Args:
        filename: Raw or sanitized file name.

    Returns:
        The lower-cased text after the last ``"."`` (``".pdf"`` yields
        ``"pdf"``), or ``""`` when there is no dot or nothing follows it.
    """
    name = str(filename or "").strip()
    # rpartition gives an empty separator when no dot is present and an empty
    # tail for names ending in "."; both cases mean "no extension".
    _, dot, ext = name.rpartition(".")
    if not dot:
        return ""
    return ext.lower()
def _detect_mime(file_header: bytes) -> str | None:
"""Detect MIME type lazily so environments without libmagic stay usable."""
try:
import magic as magic_module
except (ImportError, OSError):
return None
try:
return magic_module.from_buffer(file_header, mime=True)
except Exception:
return None
def _infer_extension_from_content(
file_header: bytes,
detected_mime: str | None,
valid_extensions: dict[str, list[str]],
) -> str:
"""Infer a safe extension from MIME type or common signatures."""
if detected_mime:
for ext, mimes in valid_extensions.items():
if detected_mime in mimes:
return ext
signature_map = {
b"%PDF": "pdf",
b"\x89PNG\r\n\x1a\n": "png",
b"\xff\xd8\xff": "jpg",
b"RIFF": "webp",
}
for signature, ext in signature_map.items():
if file_header.startswith(signature) and ext in valid_extensions:
return ext
return ""
def _check_pdf_safety(file_header: bytes):

View File

@@ -1,6 +1,7 @@
"""Tests for file validation utility."""
import io
from unittest.mock import patch, MagicMock
from unittest.mock import MagicMock
from app.utils.file_validator import validate_file, FileValidationError
import pytest
@@ -16,7 +17,7 @@ class TestFileValidator:
"""Should raise when filename is empty."""
with app.app_context():
mock_file = MagicMock()
mock_file.filename = ''
mock_file.filename = ""
with pytest.raises(FileValidationError, match="No file provided"):
validate_file(mock_file, allowed_types=["pdf"])
@@ -24,16 +25,16 @@ class TestFileValidator:
"""Should raise when file extension is not allowed."""
with app.app_context():
mock_file = MagicMock()
mock_file.filename = 'test.exe'
mock_file.filename = "test.exe"
with pytest.raises(FileValidationError, match="not allowed"):
validate_file(mock_file, allowed_types=["pdf"])
def test_empty_file_raises(self, app):
"""Should raise when file is empty (0 bytes)."""
with app.app_context():
content = io.BytesIO(b'')
content = io.BytesIO(b"")
mock_file = MagicMock()
mock_file.filename = 'test.pdf'
mock_file.filename = "test.pdf"
mock_file.seek = content.seek
mock_file.tell = content.tell
mock_file.read = content.read
@@ -43,93 +44,150 @@ class TestFileValidator:
def test_valid_pdf_passes(self, app):
    """Should accept valid PDF file with correct magic bytes."""
    with app.app_context():
        pdf_bytes = b"%PDF-1.4 test content" + b"\x00" * 8192
        content = io.BytesIO(pdf_bytes)

        mock_file = MagicMock()
        mock_file.filename = "document.pdf"
        # Delegate stream calls to a real BytesIO so seek/tell/read return
        # actual positions and bytes rather than MagicMock objects.
        mock_file.seek = content.seek
        mock_file.tell = content.tell
        mock_file.read = content.read

        # Stub MIME sniffing so the test does not depend on libmagic.
        with pytest.MonkeyPatch.context() as monkeypatch:
            monkeypatch.setattr(
                "app.utils.file_validator._detect_mime",
                lambda _header: "application/pdf",
            )
            filename, ext = validate_file(mock_file, allowed_types=["pdf"])

        assert filename == "document.pdf"
        assert ext == "pdf"
def test_valid_html_passes(self, app):
    """Should accept valid HTML file with correct MIME type."""
    with app.app_context():
        html_bytes = b"<!doctype html><html><body>Hello</body></html>"
        content = io.BytesIO(html_bytes)

        mock_file = MagicMock()
        mock_file.filename = "page.html"
        # Delegate stream calls to a real BytesIO so seek/tell/read return
        # actual positions and bytes rather than MagicMock objects.
        mock_file.seek = content.seek
        mock_file.tell = content.tell
        mock_file.read = content.read

        # Stub MIME sniffing so the test does not depend on libmagic.
        with pytest.MonkeyPatch.context() as monkeypatch:
            monkeypatch.setattr(
                "app.utils.file_validator._detect_mime",
                lambda _header: "text/html",
            )
            filename, ext = validate_file(mock_file, allowed_types=["html", "htm"])

        assert filename == "page.html"
        assert ext == "html"
def test_mime_mismatch_raises(self, app):
    """Should raise when MIME type doesn't match extension."""
    with app.app_context():
        content = io.BytesIO(b"not a real pdf" + b"\x00" * 8192)

        mock_file = MagicMock()
        mock_file.filename = "fake.pdf"
        # Delegate stream calls to a real BytesIO so seek/tell/read return
        # actual positions and bytes rather than MagicMock objects.
        mock_file.seek = content.seek
        mock_file.tell = content.tell
        mock_file.read = content.read

        # Report a MIME type that disagrees with the ".pdf" extension.
        with pytest.MonkeyPatch.context() as monkeypatch:
            monkeypatch.setattr(
                "app.utils.file_validator._detect_mime",
                lambda _header: "text/plain",
            )
            with pytest.raises(FileValidationError, match="does not match"):
                validate_file(mock_file, allowed_types=["pdf"])
def test_file_too_large_raises(self, app):
    """Should raise when file exceeds size limit."""
    with app.app_context():
        # Use a small override to keep the test stable on Windows/Python 3.13.
        large_content = io.BytesIO(b"%PDF-1.4" + b"\x00" * 2048)

        mock_file = MagicMock()
        mock_file.filename = "large.pdf"
        # Delegate stream calls to a real BytesIO so the size measured via
        # seek/tell is the real payload length.
        mock_file.seek = large_content.seek
        mock_file.tell = large_content.tell
        mock_file.read = large_content.read

        # Stub MIME sniffing so the test does not depend on libmagic.
        with pytest.MonkeyPatch.context() as monkeypatch:
            monkeypatch.setattr(
                "app.utils.file_validator._detect_mime",
                lambda _header: "application/pdf",
            )
            # The payload (2056 bytes) exceeds the 1024-byte override.
            with pytest.raises(FileValidationError, match="too large"):
                validate_file(
                    mock_file,
                    allowed_types=["pdf"],
                    size_limit_overrides={"pdf": 1024},
                )
def test_dangerous_pdf_raises(self, app):
    """Should raise when PDF contains dangerous patterns."""
    with app.app_context():
        pdf_bytes = b"%PDF-1.4 /JavaScript evil_code" + b"\x00" * 8192
        content = io.BytesIO(pdf_bytes)

        mock_file = MagicMock()
        mock_file.filename = "evil.pdf"
        # Delegate stream calls to a real BytesIO so seek/tell/read return
        # actual positions and bytes rather than MagicMock objects.
        mock_file.seek = content.seek
        mock_file.tell = content.tell
        mock_file.read = content.read

        # MIME type matches the extension, so only the PDF content check
        # can be responsible for the rejection.
        with pytest.MonkeyPatch.context() as monkeypatch:
            monkeypatch.setattr(
                "app.utils.file_validator._detect_mime",
                lambda _header: "application/pdf",
            )
            with pytest.raises(FileValidationError, match="unsafe"):
                validate_file(mock_file, allowed_types=["pdf"])
def test_pdf_with_missing_extension_name_is_inferred(self, app):
    """A bare "." filename with PDF content should come back as upload.pdf."""
    payload = io.BytesIO(b"%PDF-1.4 test content" + b"\x00" * 8192)

    upload = MagicMock()
    upload.filename = "."
    # Route stream calls to the in-memory buffer.
    upload.seek, upload.tell, upload.read = (
        payload.seek,
        payload.tell,
        payload.read,
    )

    with app.app_context(), pytest.MonkeyPatch.context() as patcher:
        # Pin the sniffed MIME type so libmagic is not required.
        patcher.setattr(
            "app.utils.file_validator._detect_mime",
            lambda _header: "application/pdf",
        )
        filename, ext = validate_file(upload, allowed_types=["pdf"])

    assert filename == "upload.pdf"
    assert ext == "pdf"
def test_pdf_hidden_filename_keeps_pdf_extension(self, app):
    """A hidden-style ".pdf" filename should yield upload.pdf with ext pdf."""
    payload = io.BytesIO(b"%PDF-1.4 test content" + b"\x00" * 8192)

    upload = MagicMock()
    upload.filename = ".pdf"
    # Route stream calls to the in-memory buffer.
    upload.seek, upload.tell, upload.read = (
        payload.seek,
        payload.tell,
        payload.read,
    )

    with app.app_context(), pytest.MonkeyPatch.context() as patcher:
        # Pin the sniffed MIME type so libmagic is not required.
        patcher.setattr(
            "app.utils.file_validator._detect_mime",
            lambda _header: "application/pdf",
        )
        filename, ext = validate_file(upload, allowed_types=["pdf"])

    assert filename == "upload.pdf"
    assert ext == "pdf"