Files
SaaS-PDF/scripts/generate_sitemap.py
Your Name f347022924 feat: add SEO configuration and pages for programmatic tools and collections
- Introduced seoPages.ts to manage SEO-related configurations and types for programmatic tools and collection pages.
- Created SeoCollectionPage and SeoProgrammaticPage components to render SEO content dynamically based on the new configuration.
- Enhanced API service to ensure CSRF token handling for secure requests.
- Added generateHowTo utility function for structured data generation.
- Updated sitemap generation script to include SEO tool and collection pages.
- Configured TypeScript to resolve JSON modules for easier integration of SEO data.   استراتيجية التنفيذ

لم أغير أي core logic في أدوات التحويل أو الضغط أو التحرير
استخدمت architecture إضافية فوق النظام الحالي بدل استبداله
جعلت الـ SEO pages تعتمد على source of truth واحد حتى يسهل التوسع
ربطت التوليد مع build حتى لا تبقى sitemap وrobots ثابتة أو منسية
دعمت العربية والإنجليزية داخل نفس config الجديد
عززت internal linking من:
صفحات SEO إلى tool pages
صفحات SEO إلى collection pages
footer إلى collection pages
Suggested tools داخل صفحات الأدوات
التحقق
2026-03-21 01:19:32 +02:00

219 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
generate_sitemap.py
Generates sitemap.xml for SEO from the full route inventory.
Usage:
python scripts/generate_sitemap.py --domain https://dociva.io
python scripts/generate_sitemap.py --domain https://dociva.io --output frontend/public/sitemap.xml
# Or set SITE_DOMAIN env var and omit --domain
"""
import argparse
import json
import os
import re
from datetime import datetime
from pathlib import Path
from xml.sax.saxutils import escape
# Static, hand-maintained routes. Every entry carries its own sitemap
# change frequency and priority (both kept as strings, ready to be
# interpolated into the XML).
PAGES = [
    {'path': path, 'changefreq': changefreq, 'priority': priority}
    for path, changefreq, priority in (
        ('/', 'daily', '1.0'),
        ('/about', 'monthly', '0.4'),
        ('/contact', 'monthly', '0.4'),
        ('/privacy', 'yearly', '0.3'),
        ('/terms', 'yearly', '0.3'),
        ('/pricing', 'monthly', '0.7'),
        ('/blog', 'weekly', '0.6'),
    )
]
# PDF tool routes: (slug, priority) pairs expanded into the dict shape
# used by the sitemap generator. Slugs are served under /tools/<slug>.
PDF_TOOLS = [
    {'slug': slug, 'priority': priority}
    for slug, priority in (
        ('pdf-to-word', '0.9'),
        ('word-to-pdf', '0.9'),
        ('compress-pdf', '0.9'),
        ('merge-pdf', '0.9'),
        ('split-pdf', '0.8'),
        ('rotate-pdf', '0.7'),
        ('pdf-to-images', '0.8'),
        ('images-to-pdf', '0.8'),
        ('watermark-pdf', '0.7'),
        ('remove-watermark-pdf', '0.7'),
        ('protect-pdf', '0.8'),
        ('unlock-pdf', '0.8'),
        ('page-numbers', '0.7'),
        ('reorder-pdf', '0.7'),
        ('extract-pages', '0.7'),
        ('pdf-editor', '0.8'),
        ('pdf-flowchart', '0.7'),
        ('pdf-to-excel', '0.8'),
        # Phase 2
        ('sign-pdf', '0.8'),
        ('crop-pdf', '0.7'),
        ('flatten-pdf', '0.7'),
        ('repair-pdf', '0.7'),
        ('pdf-metadata', '0.6'),
    )
]
# Image tool routes, as (slug, priority) pairs expanded into dicts.
IMAGE_TOOLS = [
    {'slug': slug, 'priority': priority}
    for slug, priority in (
        ('image-converter', '0.8'),
        ('image-resize', '0.8'),
        ('compress-image', '0.8'),
        ('remove-background', '0.8'),
        # Phase 2
        ('image-crop', '0.7'),
        ('image-rotate-flip', '0.7'),
    )
]
# AI tool routes; every one of them is listed with priority 0.8.
AI_TOOLS = [
    {'slug': slug, 'priority': '0.8'}
    for slug in (
        'ocr',
        'chat-pdf',
        'summarize-pdf',
        'translate-pdf',
        'extract-tables',
    )
]
# Conversion and general-purpose utility routes, as (slug, priority)
# pairs expanded into dicts.
UTILITY_TOOLS = [
    {'slug': slug, 'priority': priority}
    for slug, priority in (
        ('html-to-pdf', '0.7'),
        ('qr-code', '0.7'),
        ('video-to-gif', '0.7'),
        ('word-counter', '0.6'),
        ('text-cleaner', '0.6'),
        # Phase 2
        ('pdf-to-pptx', '0.8'),
        ('excel-to-pdf', '0.8'),
        ('pptx-to-pdf', '0.8'),
        ('barcode-generator', '0.7'),
    )
]
# (label, routes) pairs in emission order. The label is written into the
# sitemap as an XML comment ahead of that group's URLs.
TOOL_GROUPS = list(zip(
    ('PDF Tools', 'Image Tools', 'AI Tools', 'Utility Tools'),
    (PDF_TOOLS, IMAGE_TOOLS, AI_TOOLS, UTILITY_TOOLS),
))
def _clean_slugs(entries: object) -> list[str]:
    """Return the unique, non-empty ``slug`` values of *entries*, in order.

    Anything that is not a list of dicts with a string ``slug`` is ignored
    rather than raised on, so one malformed entry cannot break the build.
    """
    if not isinstance(entries, list):
        return []
    slugs = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        slug = entry.get('slug')
        if not isinstance(slug, str):
            continue
        # Strip *before* the emptiness check: a whitespace-only slug would
        # otherwise survive as '' and yield a bogus "<domain>/" URL.
        slug = slug.strip()
        if slug:
            slugs.append(slug)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(slugs))


def get_seo_landing_paths() -> tuple[list[str], list[str]]:
    """Read the programmatic SEO page slugs from ``seo-tools.json``.

    The config is looked up at ``frontend/src/config/seo-tools.json``
    relative to the parent of this script's directory.

    Returns:
        ``(tool_page_slugs, collection_page_slugs)`` taken from the
        ``toolPages`` and ``collectionPages`` arrays. Slugs are stripped,
        empty ones are dropped and duplicates are removed. Both lists are
        empty when the file is missing or its top level is not an object.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    repo_root = Path(__file__).resolve().parents[1]
    seo_config_path = repo_root / 'frontend' / 'src' / 'config' / 'seo-tools.json'
    if not seo_config_path.is_file():
        return [], []

    raw = json.loads(seo_config_path.read_text(encoding='utf-8'))
    if not isinstance(raw, dict):
        return [], []

    return _clean_slugs(raw.get('toolPages')), _clean_slugs(raw.get('collectionPages'))
# Matches `slug: '...'` or `slug: "..."`. The leading \b keeps keys that
# merely end in "slug" (e.g. `tool_slug:`) from being mistaken for a post.
_BLOG_SLUG_RE = re.compile(r"""\bslug:\s*(?:'([^']+)'|"([^"]+)")""")


def _extract_blog_slugs(content: str) -> list[str]:
    """Return the unique slug values found in *content*, in first-seen order."""
    found = (m.group(1) or m.group(2) for m in _BLOG_SLUG_RE.finditer(content))
    return list(dict.fromkeys(found))


def get_blog_slugs() -> list[str]:
    """Collect blog post slugs by scanning ``blogArticles.ts``.

    The file is looked up at ``frontend/src/content/blogArticles.ts``
    relative to the parent of this script's directory. It is scanned with
    a regular expression rather than parsed, so only literal
    ``slug: '<value>'`` / ``slug: "<value>"`` properties are recognised.

    Returns:
        Unique slugs in the order they first appear, or an empty list when
        the file does not exist.
    """
    repo_root = Path(__file__).resolve().parents[1]
    blog_articles_path = repo_root / 'frontend' / 'src' / 'content' / 'blogArticles.ts'
    if not blog_articles_path.is_file():
        return []
    return _extract_blog_slugs(blog_articles_path.read_text(encoding='utf-8'))
def _render_url(loc: str, lastmod: str, changefreq: str, priority: str) -> str:
    """Return one ``<url>`` element for the sitemap.

    *loc* is entity-escaped (``& < > ' "``) because the sitemap protocol
    requires it; an unescaped ``&`` would make the whole file invalid XML.
    """
    safe_loc = escape(loc, {'"': '&quot;', "'": '&apos;'})
    return (
        '  <url>\n'
        f'    <loc>{safe_loc}</loc>\n'
        f'    <lastmod>{lastmod}</lastmod>\n'
        f'    <changefreq>{changefreq}</changefreq>\n'
        f'    <priority>{priority}</priority>\n'
        '  </url>'
    )


def generate_sitemap(domain: str) -> str:
    """Build the complete ``sitemap.xml`` document as a string.

    Sections are emitted in this order: static pages, blog posts, the
    tool groups from ``TOOL_GROUPS`` (under ``/tools/<slug>``), programmatic
    SEO tool pages and SEO collection pages (both directly under
    ``/<slug>``). Optional sections are omitted entirely when empty. Every
    entry uses today's local date as ``<lastmod>``.

    Args:
        domain: Site origin, prepended verbatim to every path. Paths start
            with ``/``, so it should not end with a slash.

    Returns:
        The XML document, without a trailing newline.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    blog_slugs = get_blog_slugs()
    seo_tool_pages, seo_collection_pages = get_seo_landing_paths()

    # Static pages
    urls = [
        _render_url(f'{domain}{page["path"]}', today, page['changefreq'], page['priority'])
        for page in PAGES
    ]

    if blog_slugs:
        urls.append('\n  <!-- Blog Posts -->')
        urls.extend(
            _render_url(f'{domain}/blog/{slug}', today, 'monthly', '0.6')
            for slug in blog_slugs
        )

    # Tool pages by category
    for label, routes in TOOL_GROUPS:
        urls.append(f'\n  <!-- {label} -->')
        urls.extend(
            _render_url(f'{domain}/tools/{route["slug"]}', today, 'weekly', route['priority'])
            for route in routes
        )

    if seo_tool_pages:
        urls.append('\n  <!-- Programmatic SEO Tool Pages -->')
        urls.extend(
            _render_url(f'{domain}/{slug}', today, 'weekly', '0.88')
            for slug in seo_tool_pages
        )

    if seo_collection_pages:
        urls.append('\n  <!-- SEO Collection Pages -->')
        urls.extend(
            _render_url(f'{domain}/{slug}', today, 'weekly', '0.82')
            for slug in seo_collection_pages
        )

    body = '\n'.join(urls)
    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        f'{body}\n'
        '</urlset>'
    )
def main() -> None:
    """Command-line entry point: generate the sitemap and write it to disk.

    ``--domain`` defaults to the ``SITE_DOMAIN`` environment variable and
    ``--output`` to ``frontend/public/sitemap.xml``. Exits through
    ``parser.error`` when no usable domain is supplied.
    """
    parser = argparse.ArgumentParser(description='Generate sitemap.xml')
    parser.add_argument('--domain', type=str, default=os.environ.get('SITE_DOMAIN', ''),
                        help='Site domain (e.g. https://dociva.io). Falls back to SITE_DOMAIN env var.')
    parser.add_argument('--output', type=str, default='frontend/public/sitemap.xml', help='Output file path')
    args = parser.parse_args()

    # Normalise first, then validate: a value such as "/" or " " would
    # otherwise pass the check and produce a sitemap of relative URLs.
    domain = args.domain.strip().rstrip('/')
    if not domain:
        parser.error('--domain is required (or set SITE_DOMAIN env var)')

    sitemap = generate_sitemap(domain)

    output_path = Path(args.output)
    # Create missing parent directories so a fresh checkout does not fail.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(sitemap, encoding='utf-8')

    # Count what was actually written instead of re-reading the inputs,
    # so the reported total can never drift from the file's contents.
    total = sitemap.count('<url>')
    print(f"Sitemap generated: {args.output}")
    print(f"Total URLs: {total}")


if __name__ == '__main__':
    main()