diff --git a/.env.example b/.env.example index c53637e..018d569 100644 --- a/.env.example +++ b/.env.example @@ -2,7 +2,7 @@ FLASK_ENV=production FLASK_DEBUG=0 SECRET_KEY=replace-with-a-long-random-secret-key -INTERNAL_ADMIN_EMAILS=admin@dociva.io +INTERNAL_ADMIN_EMAILS=support@dociva.io # Site Domain (used in sitemap, robots.txt, emails) SITE_DOMAIN=https://dociva.io diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..25c5b7b --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,71 @@ +# Copilot Workspace Instructions + +Purpose +- Help Copilot-style agents and contributors be productive and safe in this repository. +- Surface where to find authoritative docs and which conventions to follow. + +Principles +- Link, don't embed: prefer linking to existing docs in `docs/`, `CONTRIBUTING.md`, or `README.md` rather than duplicating content. +- Minimize blast radius: make minimal, focused changes and explain rationale in PRs. +- Ask clarifying questions before large or ambiguous changes. + +What the agent is allowed to do +- Suggest edits, create focused patches, and propose new files following repo style. +- Use `apply_patch` for file edits; create new files only when necessary. +- Run or suggest commands to run tests locally, but do not push or merge without human approval. + +Conventions & expectations +- Follow existing code style and directory boundaries (`backend/` for Flask/Python, `frontend/` for Vite/TypeScript). +- When changing behavior, run tests and list the commands to reproduce the failure/fix. +- Keep PRs small and target a single logical change. + +Key files & links (authoritative sources) +- README: [README.md](README.md#L1) +- Contribution & tests: [CONTRIBUTING.md](CONTRIBUTING.md#L1) +- Docker & run commands: [docs/Docker-Commands-Guide.md](docs/Docker-Commands-Guide.md#L1) +- Backend entry & requirements: [backend/requirements.txt](backend/requirements.txt#L1), [backend/Dockerfile](backend/Dockerfile#L1) +- Frontend scripts: [frontend/package.json](frontend/package.json#L1), [frontend/Dockerfile](frontend/Dockerfile#L1) +- Compose files: [docker-compose.yml](docker-compose.yml#L1), [docker-compose.prod.yml](docker-compose.prod.yml#L1) +- Deployment scripts: [scripts/deploy.sh](scripts/deploy.sh#L1) + +Common build & test commands +- Backend tests (project root): +``` +cd backend && python -m pytest tests/ -q +``` +- Frontend dev & tests: +``` +cd frontend && npm install +cd frontend && npm run dev +cd frontend && npx vitest run +``` +- Dev compose (full stack): +``` +docker compose up --build +``` +- Prod deploy (refer to `scripts/deploy.sh`): +``` +./scripts/deploy.sh +``` + +Anti-patterns (avoid) +- Don't invent architectural decisions or rewrite large areas without explicit approval. +- Don't add secrets, large binary files, or unrelated formatting changes. +- Don't run destructive commands or modify CI/CD configuration without coordination. + +Agent prompts & examples +- "Create a small Flask route in `backend/app/routes` that returns health JSON and add a unit test." +- "Refactor the image compression service to extract a helper; update callers and tests." +- "List the exact commands I should run to reproduce the failing tests for `backend/tests/test_pdf_service.py`." + +Suggested follow-ups (agent customizations) +- `create-agent:backend` — focused on Python/Flask edits, runs `pytest`, and knows `backend/` structure. +- `create-agent:frontend` — focused on Vite/TypeScript, runs `vitest`, and uses `npm` scripts. +- `create-agent:ci` — analyzes `docker-compose.yml` and `scripts/deploy.sh`, suggests CI checks and smoke tests. + +If you want, I can: +- Open a draft PR with this file, or +- Expand the file with more precise command snippets and per-service README links. + +--- +Generated by a workspace bootstrap; iterate as needed. diff --git a/backend/app/__init__.py b/backend/app/__init__.py index 4edb772..689db5d 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -1,4 +1,5 @@ """Flask Application Factory.""" + import os from flask import Flask, jsonify @@ -11,7 +12,12 @@ from app.services.ai_cost_service import init_ai_cost_db from app.services.site_assistant_service import init_site_assistant_db from app.services.contact_service import init_contact_db from app.services.stripe_service import init_stripe_db -from app.utils.csrf import CSRFError, apply_csrf_cookie, should_enforce_csrf, validate_csrf_request +from app.utils.csrf import ( + CSRFError, + apply_csrf_cookie, + should_enforce_csrf, + validate_csrf_request, +) def _init_sentry(app): @@ -35,13 +41,15 @@ def _init_sentry(app): app.logger.warning("sentry-sdk not installed — monitoring disabled.") -def create_app(config_name=None): +def create_app(config_name=None, config_overrides=None): """Create and configure the Flask application.""" if config_name is None: config_name = os.getenv("FLASK_ENV", "development") app = Flask(__name__) app.config.from_object(config[config_name]) + if config_overrides: + app.config.update(config_overrides) # Initialize Sentry early _init_sentry(app) diff --git a/backend/app/utils/file_validator.py b/backend/app/utils/file_validator.py index 3dd1fa7..a0ff552 100644 --- a/backend/app/utils/file_validator.py +++ b/backend/app/utils/file_validator.py @@ -1,11 +1,6 @@ """File validation utilities — multi-layer security checks.""" -import os -try: - import magic - HAS_MAGIC = True -except (ImportError, OSError): - HAS_MAGIC = False +import os from flask import current_app from werkzeug.utils import secure_filename @@ -45,30 +40,60 @@ def validate_file( if not file_storage or file_storage.filename == "": raise FileValidationError("No file provided.") - filename = secure_filename(file_storage.filename) - if not filename: - raise FileValidationError("Invalid filename.") + raw_filename = str(file_storage.filename).strip() + if not raw_filename: + raise FileValidationError("No file provided.") - # Layer 2: Check file extension against whitelist - ext = _get_extension(filename) + filename = secure_filename(raw_filename) allowed_extensions = config.get("ALLOWED_EXTENSIONS", {}) if allowed_types: - valid_extensions = {k: v for k, v in allowed_extensions.items() if k in allowed_types} + valid_extensions = { + k: v for k, v in allowed_extensions.items() if k in allowed_types + } else: valid_extensions = allowed_extensions + # Layer 2: Reject clearly invalid extensions before touching file streams. + ext = _get_extension(raw_filename) or _get_extension(filename) + if ext and ext not in valid_extensions: + raise FileValidationError( + f"File type '.{ext}' is not allowed. " + f"Allowed types: {', '.join(valid_extensions.keys())}" + ) + + # Layer 3: Check basic file size and header first so we can recover + # from malformed filenames like ".pdf" or "." using content sniffing. + file_storage.seek(0, os.SEEK_END) + file_size = file_storage.tell() + file_storage.seek(0) + + if file_size == 0: + raise FileValidationError("File is empty.") + + file_header = file_storage.read(8192) + file_storage.seek(0) + + detected_mime = _detect_mime(file_header) + + if not ext: + ext = _infer_extension_from_content( + file_header, detected_mime, valid_extensions + ) + + if raw_filename.startswith(".") and not _get_extension(filename): + filename = "" + + if not filename: + filename = f"upload.{ext}" if ext else "upload" + if ext not in valid_extensions: raise FileValidationError( f"File type '.{ext}' is not allowed. " f"Allowed types: {', '.join(valid_extensions.keys())}" ) - # Layer 3: Check file size against type-specific limits - file_storage.seek(0, os.SEEK_END) - file_size = file_storage.tell() - file_storage.seek(0) - + # Layer 4: Check file size against type-specific limits size_limits = size_limit_overrides or config.get("FILE_SIZE_LIMITS", {}) max_size = size_limits.get(ext, 20 * 1024 * 1024) # Default 20MB @@ -78,15 +103,8 @@ def validate_file( f"File too large. Maximum size for .{ext} files is {max_mb:.0f}MB." ) - if file_size == 0: - raise FileValidationError("File is empty.") - - # Layer 4: Check MIME type using magic bytes (if libmagic is available) - file_header = file_storage.read(8192) - file_storage.seek(0) - - if HAS_MAGIC: - detected_mime = magic.from_buffer(file_header, mime=True) + # Layer 5: Check MIME type using magic bytes (if libmagic is available) + if detected_mime: expected_mimes = valid_extensions.get(ext, []) if detected_mime not in expected_mimes: @@ -95,7 +113,7 @@ def validate_file( f"Detected type: {detected_mime}" ) - # Layer 5: Additional content checks for specific types + # Layer 6: Additional content checks for specific types if ext == "pdf": _check_pdf_safety(file_header) @@ -104,9 +122,52 @@ def validate_file( def _get_extension(filename: str) -> str: """Extract and normalize file extension.""" - if "." not in filename: + filename = str(filename or "").strip() + if not filename or "." not in filename: return "" - return filename.rsplit(".", 1)[1].lower() + stem, ext = filename.rsplit(".", 1) + if not ext: + return "" + if not stem and filename.startswith("."): + return ext.lower() + return ext.lower() + + +def _detect_mime(file_header: bytes) -> str | None: + """Detect MIME type lazily so environments without libmagic stay usable.""" + try: + import magic as magic_module + except (ImportError, OSError): + return None + + try: + return magic_module.from_buffer(file_header, mime=True) + except Exception: + return None + + +def _infer_extension_from_content( + file_header: bytes, + detected_mime: str | None, + valid_extensions: dict[str, list[str]], +) -> str: + """Infer a safe extension from MIME type or common signatures.""" + if detected_mime: + for ext, mimes in valid_extensions.items(): + if detected_mime in mimes: + return ext + + signature_map = { + b"%PDF": "pdf", + b"\x89PNG\r\n\x1a\n": "png", + b"\xff\xd8\xff": "jpg", + b"RIFF": "webp", + } + for signature, ext in signature_map.items(): + if file_header.startswith(signature) and ext in valid_extensions: + return ext + + return "" def _check_pdf_safety(file_header: bytes): diff --git a/backend/tests/test_file_validator.py b/backend/tests/test_file_validator.py index c95d099..16c8424 100644 --- a/backend/tests/test_file_validator.py +++ b/backend/tests/test_file_validator.py @@ -1,6 +1,7 @@ """Tests for file validation utility.""" + import io -from unittest.mock import patch, MagicMock +from unittest.mock import MagicMock from app.utils.file_validator import validate_file, FileValidationError import pytest @@ -16,7 +17,7 @@ class TestFileValidator: """Should raise when filename is empty.""" with app.app_context(): mock_file = MagicMock() - mock_file.filename = '' + mock_file.filename = "" with pytest.raises(FileValidationError, match="No file provided"): validate_file(mock_file, allowed_types=["pdf"]) @@ -24,16 +25,16 @@ class TestFileValidator: """Should raise when file extension is not allowed.""" with app.app_context(): mock_file = MagicMock() - mock_file.filename = 'test.exe' + mock_file.filename = "test.exe" with pytest.raises(FileValidationError, match="not allowed"): validate_file(mock_file, allowed_types=["pdf"]) def test_empty_file_raises(self, app): """Should raise when file is empty (0 bytes).""" with app.app_context(): - content = io.BytesIO(b'') + content = io.BytesIO(b"") mock_file = MagicMock() - mock_file.filename = 'test.pdf' + mock_file.filename = "test.pdf" mock_file.seek = content.seek mock_file.tell = content.tell mock_file.read = content.read @@ -43,93 +44,150 @@ class TestFileValidator: def test_valid_pdf_passes(self, app): """Should accept valid PDF file with correct magic bytes.""" with app.app_context(): - pdf_bytes = b'%PDF-1.4 test content' + b'\x00' * 8192 + pdf_bytes = b"%PDF-1.4 test content" + b"\x00" * 8192 content = io.BytesIO(pdf_bytes) mock_file = MagicMock() - mock_file.filename = 'document.pdf' + mock_file.filename = "document.pdf" mock_file.seek = content.seek mock_file.tell = content.tell mock_file.read = content.read - with patch('app.utils.file_validator.HAS_MAGIC', True), patch( - 'app.utils.file_validator.magic', create=True - ) as mock_magic: - mock_magic.from_buffer.return_value = 'application/pdf' + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr( + "app.utils.file_validator._detect_mime", + lambda _header: "application/pdf", + ) filename, ext = validate_file(mock_file, allowed_types=["pdf"]) - assert filename == 'document.pdf' - assert ext == 'pdf' + assert filename == "document.pdf" + assert ext == "pdf" def test_valid_html_passes(self, app): """Should accept valid HTML file with correct MIME type.""" with app.app_context(): - html_bytes = b'
Hello' + html_bytes = b"Hello" content = io.BytesIO(html_bytes) mock_file = MagicMock() - mock_file.filename = 'page.html' + mock_file.filename = "page.html" mock_file.seek = content.seek mock_file.tell = content.tell mock_file.read = content.read - with patch('app.utils.file_validator.HAS_MAGIC', True), patch( - 'app.utils.file_validator.magic', create=True - ) as mock_magic: - mock_magic.from_buffer.return_value = 'text/html' + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr( + "app.utils.file_validator._detect_mime", + lambda _header: "text/html", + ) filename, ext = validate_file(mock_file, allowed_types=["html", "htm"]) - assert filename == 'page.html' - assert ext == 'html' + assert filename == "page.html" + assert ext == "html" def test_mime_mismatch_raises(self, app): """Should raise when MIME type doesn't match extension.""" with app.app_context(): - content = io.BytesIO(b'not a real pdf' + b'\x00' * 8192) + content = io.BytesIO(b"not a real pdf" + b"\x00" * 8192) mock_file = MagicMock() - mock_file.filename = 'fake.pdf' + mock_file.filename = "fake.pdf" mock_file.seek = content.seek mock_file.tell = content.tell mock_file.read = content.read - with patch('app.utils.file_validator.HAS_MAGIC', True), patch( - 'app.utils.file_validator.magic', create=True - ) as mock_magic: - mock_magic.from_buffer.return_value = 'text/plain' + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr( + "app.utils.file_validator._detect_mime", + lambda _header: "text/plain", + ) with pytest.raises(FileValidationError, match="does not match"): validate_file(mock_file, allowed_types=["pdf"]) def test_file_too_large_raises(self, app): """Should raise when file exceeds size limit.""" with app.app_context(): - # Create a file larger than the PDF size limit (20MB) - large_content = io.BytesIO(b'%PDF-1.4' + b'\x00' * (21 * 1024 * 1024)) + # Use a small override to keep the test stable on Windows/Python 3.13. + large_content = io.BytesIO(b"%PDF-1.4" + b"\x00" * 2048) mock_file = MagicMock() - mock_file.filename = 'large.pdf' + mock_file.filename = "large.pdf" mock_file.seek = large_content.seek mock_file.tell = large_content.tell mock_file.read = large_content.read - with pytest.raises(FileValidationError, match="too large"): - validate_file(mock_file, allowed_types=["pdf"]) + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr( + "app.utils.file_validator._detect_mime", + lambda _header: "application/pdf", + ) + with pytest.raises(FileValidationError, match="too large"): + validate_file( + mock_file, + allowed_types=["pdf"], + size_limit_overrides={"pdf": 1024}, + ) def test_dangerous_pdf_raises(self, app): """Should raise when PDF contains dangerous patterns.""" with app.app_context(): - pdf_bytes = b'%PDF-1.4 /JavaScript evil_code' + b'\x00' * 8192 + pdf_bytes = b"%PDF-1.4 /JavaScript evil_code" + b"\x00" * 8192 content = io.BytesIO(pdf_bytes) mock_file = MagicMock() - mock_file.filename = 'evil.pdf' + mock_file.filename = "evil.pdf" mock_file.seek = content.seek mock_file.tell = content.tell mock_file.read = content.read - with patch('app.utils.file_validator.HAS_MAGIC', True), patch( - 'app.utils.file_validator.magic', create=True - ) as mock_magic: - mock_magic.from_buffer.return_value = 'application/pdf' + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr( + "app.utils.file_validator._detect_mime", + lambda _header: "application/pdf", + ) with pytest.raises(FileValidationError, match="unsafe"): validate_file(mock_file, allowed_types=["pdf"]) + + def test_pdf_with_missing_extension_name_is_inferred(self, app): + """Should infer PDF extension from content when filename lacks one.""" + with app.app_context(): + pdf_bytes = b"%PDF-1.4 test content" + b"\x00" * 8192 + content = io.BytesIO(pdf_bytes) + + mock_file = MagicMock() + mock_file.filename = "." + mock_file.seek = content.seek + mock_file.tell = content.tell + mock_file.read = content.read + + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr( + "app.utils.file_validator._detect_mime", + lambda _header: "application/pdf", + ) + filename, ext = validate_file(mock_file, allowed_types=["pdf"]) + + assert filename == "upload.pdf" + assert ext == "pdf" + + def test_pdf_hidden_filename_keeps_pdf_extension(self, app): + """Should preserve .pdf from hidden-style filenames like .pdf.""" + with app.app_context(): + pdf_bytes = b"%PDF-1.4 test content" + b"\x00" * 8192 + content = io.BytesIO(pdf_bytes) + + mock_file = MagicMock() + mock_file.filename = ".pdf" + mock_file.seek = content.seek + mock_file.tell = content.tell + mock_file.read = content.read + + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr( + "app.utils.file_validator._detect_mime", + lambda _header: "application/pdf", + ) + filename, ext = validate_file(mock_file, allowed_types=["pdf"]) + + assert filename == "upload.pdf" + assert ext == "pdf" diff --git a/backend/tests/test_rate_limiter.py b/backend/tests/test_rate_limiter.py index 35f4c31..526f224 100644 --- a/backend/tests/test_rate_limiter.py +++ b/backend/tests/test_rate_limiter.py @@ -1,6 +1,8 @@ """Tests for rate limiting middleware.""" + import pytest from app import create_app +from tests.conftest import CSRFTestClient @pytest.fixture @@ -11,33 +13,24 @@ def rate_limited_app(tmp_path): never throttled. Here we force the extension's internal flag back to True *after* init_app so the decorator limits are enforced. """ - app = create_app('testing') - app.config.update({ - 'TESTING': True, - 'RATELIMIT_STORAGE_URI': 'memory://', - 'UPLOAD_FOLDER': str(tmp_path / 'uploads'), - 'OUTPUT_FOLDER': str(tmp_path / 'outputs'), - }) + app = create_app( + "testing", + { + "TESTING": True, + "RATELIMIT_ENABLED": True, + "RATELIMIT_STORAGE_URI": "memory://", + "UPLOAD_FOLDER": str(tmp_path / "uploads"), + "OUTPUT_FOLDER": str(tmp_path / "outputs"), + }, + ) + app.test_client_class = CSRFTestClient import os - os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) - os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True) - # flask-limiter 3.x returns from init_app immediately when - # RATELIMIT_ENABLED=False (TestingConfig default), so `initialized` - # stays False and no limits are enforced. We override the config key - # and call init_app a SECOND time so the extension fully initialises. - # It is safe to call twice — flask-limiter guards against duplicate - # before_request hook registration via app.extensions["limiter"]. - from app.extensions import limiter as _limiter - app.config['RATELIMIT_ENABLED'] = True - _limiter.init_app(app) # second call — now RATELIMIT_ENABLED=True + os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True) + os.makedirs(app.config["OUTPUT_FOLDER"], exist_ok=True) yield app - # Restore so other tests are unaffected - _limiter.enabled = False - _limiter.initialized = False - @pytest.fixture def rate_limited_client(rate_limited_app): @@ -48,12 +41,12 @@ class TestRateLimiter: def test_health_endpoint_not_rate_limited(self, client): """Health endpoint should handle many rapid requests.""" for _ in range(20): - response = client.get('/api/health') + response = client.get("/api/health") assert response.status_code == 200 def test_rate_limit_header_present(self, client): """Response should include a valid HTTP status code.""" - response = client.get('/api/health') + response = client.get("/api/health") assert response.status_code == 200 @@ -68,7 +61,7 @@ class TestRateLimitEnforcement: """ blocked = False for i in range(15): - r = rate_limited_client.post('/api/compress/pdf') + r = rate_limited_client.post("/api/compress/pdf") if r.status_code == 429: blocked = True break @@ -81,7 +74,7 @@ class TestRateLimitEnforcement: """POST /api/convert/pdf-to-word is also rate-limited.""" blocked = False for _ in range(15): - r = rate_limited_client.post('/api/convert/pdf-to-word') + r = rate_limited_client.post("/api/convert/pdf-to-word") if r.status_code == 429: blocked = True break @@ -94,8 +87,8 @@ class TestRateLimitEnforcement: """ # Exhaust compress limit for _ in range(15): - rate_limited_client.post('/api/compress/pdf') + rate_limited_client.post("/api/compress/pdf") # Health should still respond normally - r = rate_limited_client.get('/api/health') - assert r.status_code == 200 \ No newline at end of file + r = rate_limited_client.get("/api/health") + assert r.status_code == 200 diff --git a/frontend/index.html b/frontend/index.html index 4d2e5b1..6ff05ac 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -7,12 +7,13 @@ + + + - - +