Files
SaaS-PDF/scripts/build_keyword_portfolio.py
2026-04-05 15:25:26 +02:00

1152 lines
41 KiB
Python

#!/usr/bin/env python3
"""
Build a multilingual keyword portfolio from Google Ads exports.
Usage:
python scripts/build_keyword_portfolio.py
python scripts/build_keyword_portfolio.py --output-dir docs/keyword-research/2026-04-05
"""
from __future__ import annotations
import argparse
import csv
import math
import re
import unicodedata
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
# Repository root (this script lives in <root>/scripts/).
ROOT = Path(__file__).resolve().parents[1]
# The two original Google Ads exports checked into docs/.
BASE_INPUTS = [
    ROOT / "docs" / "KeywordStats_4_5_2026.csv",
    ROOT / "docs" / "Keyword Stats 2026-04-05 at 10_02_37.csv",
]
DEFAULT_OUTPUT_DIR = ROOT / "docs" / "keyword-research" / "2026-04-05"
# Extra planner CSVs dropped into the output dir's Keywords/ folder are picked
# up automatically by discover_default_inputs().
SUPPLEMENTAL_INPUT_DIR = DEFAULT_OUTPUT_DIR / "Keywords"
# Languages the product supports today vs. growth/watchlist markets.
SUPPORTED_LANGUAGES = {"en", "ar", "fr"}
GROWTH_LANGUAGES = {"es"}
WATCHLIST_LANGUAGES = {"zh", "it", "pt", "other"}
# Modifier vocabularies consumed by contains_any(): single words must match a
# whole token, phrases (and Arabic/CJK markers) match as substrings.
HOW_TO_MARKERS = {"how to", "comment ", "كيفية", "كيف ", "how do i"}
FREE_MARKERS = {"free", "gratis", "gratuit"}
ONLINE_MARKERS = {"online", "en ligne"}
PAGE_MARKERS = {"page", "pages", "pagina", "paginas", "página", "páginas"}
FILE_MARKERS = {"file", "files", "document", "documents", "archivo", "archivos", "fichier", "fichiers"}
# Action-verb vocabularies per cluster, covering EN/ES/FR/IT/AR/ZH variants.
SPLIT_MARKERS = {
    "split",
    "splitter",
    "splitpdf",
    "pdfsplit",
    "separate",
    "separator",
    "divide",
    "divider",
    "cut",
    "cutter",
    "slicer",
    "trimmer",
    "breaker",
    "unmerge",
    "dividir",
    "separar",
    "separa",
    "separador",
    "cortar",
    "diviser",
    "séparer",
    "separer",
    "fractionner",
    "decouper",
    "découper",
    "couper",
    "dividi",
    "تقسيم",
    "فصل",
    "拆分",
    "分割",
}
EXTRACT_MARKERS = {"extract", "extractor", "extraction", "extract pages", "استخراج"}
MERGE_MARKERS = {"merge", "merger", "combine", "join", "fusionner", "fusion", "دمج"}
COMPRESS_MARKERS = {"compress", "compressor", "compression", "reduce size", "reduce pdf", "ضغط"}
CONVERT_MARKERS = {"convert", "converter", "conversion", "to pdf", "pdf to", "تحويل"}
EDIT_MARKERS = {"edit", "editor", "editing", "software"}
IMAGE_TO_PDF_MARKERS = {"image pdf", "images to pdf", "image to pdf", "add image to pdf", "photo to pdf", "jpg to pdf", "png to pdf"}
PDF_TOOL_MARKERS = {"pdf tools", "tool pdf", "pdf tool"}
PDF_TO_WORD_MARKERS = {"pdf to word", "pdf to doc", "pdf to docx", "convert pdf to word"}
WORD_TO_PDF_MARKERS = {"word to pdf", "doc to pdf", "docx to pdf", "convert word to pdf"}
OCR_MARKERS = {
    "ocr",
    "text recognition",
    "extract text from image",
    "extract text from pdf",
    "image to text",
    "pdf to text",
    "scan to text",
    "optical character recognition",
    "استخراج النص",
}
# Language-identification vocabularies. detect_language() checks these in a
# fixed order (fr before es before it/pt) because some words overlap.
SPANISH_MARKERS = {"dividir", "separar", "separa", "separador", "gratis", "cortar"}
FRENCH_MARKERS = {"diviser", "séparer", "separer", "fractionner", "decouper", "découper", "couper", "gratuit"}
ITALIAN_MARKERS = {"dividi"}
PORTUGUESE_MARKERS = {"separador"}
# Whitelists of natural phrasings per cluster. detect_cluster_phrase_issue()
# flags a keyword as "unnatural_cluster_phrase" when its cluster has a
# whitelist and the keyword (after stripping any how-to prefix) matches none
# of these patterns.
SPLIT_VALID_PATTERNS = [
    re.compile(r"^(?:online )?split pdf(?: free| online| free online| pages| pages free| file| files| document)?$"),
    re.compile(r"^pdf split(?: online| free)?$"),
    re.compile(r"^pdf splitter(?: online| free| free online)?$"),
    re.compile(r"^splitter pdf$"),
    re.compile(r"^separate pdf(?: pages| files| free| pages free)?$"),
    re.compile(r"^pdf separate(?: pages)?$"),
    re.compile(r"^pdf separator$"),
    re.compile(r"^pdf page separator$"),
    re.compile(r"^cut pdf(?: pages)?$"),
    re.compile(r"^pdf cutter(?: online)?$"),
    re.compile(r"^pdf divider$"),
    re.compile(r"^unmerge pdf(?: free| online)?$"),
    re.compile(r"^dividir pdf(?: gratis| online)?$"),
    re.compile(r"^separar pdf$"),
    re.compile(r"^separa pdf$"),
    re.compile(r"^separador de pdf$"),
    re.compile(r"^cortar pdf$"),
    re.compile(r"^diviser pdf$"),
    re.compile(r"^séparer pdf$"),
    re.compile(r"^separer pdf$"),
    re.compile(r"^fractionner pdf$"),
    re.compile(r"^decouper pdf$"),
    re.compile(r"^découper pdf$"),
    re.compile(r"^couper pdf$"),
    re.compile(r"^pdfsplit$"),
    re.compile(r"^splitpdf$"),
    re.compile(r"^(?:拆分pdf|pdf拆分|分割pdf|pdf分割)$"),
]
EXTRACT_VALID_PATTERNS = [
    re.compile(r"^extract pages? from pdf$"),
    re.compile(r"^pdf extractor$"),
    re.compile(r"^extract pdf$"),
    re.compile(r"^extract pdf pages$"),
    re.compile(r"^pdf extract(?:or)?$"),
]
MERGE_VALID_PATTERNS = [
    re.compile(r"^merge pdf(?: files| documents| free| online)?$"),
    re.compile(r"^pdf merge$"),
    re.compile(r"^pdf merger$"),
    re.compile(r"^دمج pdf$"),
]
COMPRESS_VALID_PATTERNS = [
    re.compile(r"^compress pdf(?: file| document| online| free| online free)?$"),
    re.compile(r"^pdf compressor(?: free| online)?$"),
    re.compile(r"^pdf compression$"),
    re.compile(r"^ضغط pdf$"),
]
CONVERSION_VALID_PATTERNS = [
    re.compile(r"^pdf converter$"),
    re.compile(r"^convert (?:file|file type|document|documents|image|images|photo|photos|word|doc|docx|excel|xls|xlsx|ppt|pptx|powerpoint|html|text|txt) to pdf$"),
    re.compile(r"^(?:word|doc|docx|excel|xls|xlsx|ppt|pptx|powerpoint|html|image|images|photo|photos|jpg|jpeg|png) to pdf$"),
    re.compile(r"^pdf to (?:word|excel|ppt|pptx|powerpoint|images?|jpg|jpeg|png)$"),
]
EDITOR_VALID_PATTERNS = [
    re.compile(r"^pdf editor$"),
    re.compile(r"^edit pdf$"),
    re.compile(r"^pdf editing software$"),
    re.compile(r"^online pdf editor$"),
]
IMAGE_TO_PDF_VALID_PATTERNS = [
    re.compile(r"^image pdf$"),
    re.compile(r"^image to pdf$"),
    re.compile(r"^images to pdf$"),
    re.compile(r"^add image to pdf(?: document)?$"),
    re.compile(r"^photo to pdf$"),
    re.compile(r"^jpg to pdf$"),
    re.compile(r"^png to pdf$"),
]
PDF_TO_WORD_VALID_PATTERNS = [
    re.compile(r"^pdf to (?:word|doc|docx)$"),
    re.compile(r"^convert pdf to (?:word|doc|docx)$"),
    re.compile(r"^تحويل pdf (?:الى|إلى) (?:word|وورد)$"),
    re.compile(r"^تحويل من pdf (?:الى|إلى) (?:word|وورد)$"),
    re.compile(r"^(?:pdf|بي دي اف) (?:الى|إلى) (?:word|وورد)$"),
]
WORD_TO_PDF_VALID_PATTERNS = [
    re.compile(r"^(?:word|doc|docx) to pdf$"),
    re.compile(r"^convert (?:word|doc|docx) to pdf$"),
    re.compile(r"^تحويل (?:word|وورد|doc|docx) (?:الى|إلى) pdf$"),
    re.compile(r"^تحويل من (?:word|وورد|doc|docx) (?:الى|إلى) pdf$"),
]
OCR_VALID_PATTERNS = [
    re.compile(r"^ocr(?: pdf| image| scanner)?$"),
    re.compile(r"^text recognition$"),
    re.compile(r"^extract text from (?:image|pdf|scan|scanned pdf)$"),
    re.compile(r"^image to text$"),
    re.compile(r"^pdf to text$"),
    re.compile(r"^scan to text$"),
    re.compile(r"^optical character recognition$"),
    re.compile(r"^استخراج النص من (?:pdf|صورة)$"),
    re.compile(r"^تحويل (?:pdf|صورة) (?:الى|إلى) نص$"),
]
# Competitor brand detection; matched keywords are routed to the watchlist.
BRAND_PATTERNS = {
    "ilovepdf": re.compile(r"\bi\s*love\s*pdf\b|\bilovepdf\b", re.IGNORECASE),
    "smallpdf": re.compile(r"\bsmall\s*pdf\b|\bsmallpdf\b", re.IGNORECASE),
    "sejda": re.compile(r"\bsejda\b", re.IGNORECASE),
    "adobe": re.compile(r"\badobe\b|\bacrobat\b", re.IGNORECASE),
    "cutepdf": re.compile(r"\bcute\s*pdf\b|\bcutepdf\b", re.IGNORECASE),
    "pdf24": re.compile(r"\bpdf\s*24\b|\bpdf24\b", re.IGNORECASE),
}
# Exact phrases considered too ambiguous to target on any landing page.
AMBIGUOUS_EXACT = {
    "split",
    "pdf",
    "pd f",
    "pdf file",
    "pdf format",
    "pdf online",
    "split pages",
    "split online",
    "page separator",
    "pdf to split",
    "pdf smart",
}
# Per-cluster landing-page mapping and editorial guidance. Every cluster id
# returned by classify_cluster() must have an entry here (build_keyword_rows
# indexes this dict without a fallback).
CLUSTER_METADATA = {
    "split-pdf": {
        "label": "Split PDF",
        "recommended_target": "/tools/split-pdf",
        "target_type": "live_tool",
        "implementation_note": "Prioritize this existing landing page with unbranded transactional terms and page-focused variants.",
    },
    "extract-pages": {
        "label": "Extract Pages",
        "recommended_target": "/tools/extract-pages",
        "target_type": "live_tool",
        "implementation_note": "Use as a secondary page cluster for extraction-specific and page-removal intent.",
    },
    "merge-pdf": {
        "label": "Merge PDF",
        "recommended_target": "/tools/merge-pdf",
        "target_type": "live_tool",
        "implementation_note": "Target merge-specific queries separately from split keywords to avoid mixed intent pages.",
    },
    "compress-pdf": {
        "label": "Compress PDF",
        "recommended_target": "/tools/compress-pdf",
        "target_type": "live_tool",
        "implementation_note": "This cluster broadens reach beyond split and should be treated as a parallel priority pillar.",
    },
    "pdf-to-word": {
        "label": "PDF to Word",
        "recommended_target": "/tools/pdf-to-word",
        "target_type": "live_tool",
        "implementation_note": "Map direct PDF-to-Word conversion intent to the existing converter page rather than a generic conversion hub.",
    },
    "word-to-pdf": {
        "label": "Word to PDF",
        "recommended_target": "/tools/word-to-pdf",
        "target_type": "live_tool",
        "implementation_note": "Route Word-to-PDF terms to the dedicated converter page because the intent is specific and high value.",
    },
    "ocr": {
        "label": "OCR / Text Extraction",
        "recommended_target": "/tools/ocr",
        "target_type": "live_tool",
        "implementation_note": "Send OCR and text-extraction intent to the OCR tool page instead of mixing it into broad AI copy.",
    },
    "pdf-conversion": {
        "label": "PDF Conversion Hub",
        "recommended_target": "homepage-or-future-conversion-hub",
        "target_type": "hub_or_future_page",
        "implementation_note": "Use these keywords to justify a collection page for generic converter intent.",
    },
    "pdf-editor": {
        "label": "PDF Editor",
        "recommended_target": "/tools/pdf-editor",
        "target_type": "live_tool",
        "implementation_note": "Position editor and editing-software terms on the live PDF editor page.",
    },
    "images-to-pdf": {
        "label": "Images to PDF",
        "recommended_target": "/tools/images-to-pdf",
        "target_type": "live_tool",
        "implementation_note": "Capture image-to-PDF phrasing and upload intent on the existing converter tool.",
    },
    "mixed-pdf-operations": {
        "label": "Mixed PDF Operations",
        "recommended_target": "homepage-or-future-pdf-tools-hub",
        "target_type": "hub_or_future_page",
        "implementation_note": "Mixed split-and-merge intent should point to a tools hub, not a single-action landing page.",
    },
    "pdf-tools-hub": {
        "label": "PDF Tools Hub",
        "recommended_target": "homepage-or-future-pdf-tools-hub",
        "target_type": "hub_or_future_page",
        "implementation_note": "Reserve this cluster for clear hub-style terms such as pdf tools.",
    },
    "unclear": {
        "label": "Manual Review",
        "recommended_target": "manual-review",
        "target_type": "manual_review",
        "implementation_note": "Keep unclear terms out of the primary portfolio until manually validated.",
    },
}
# Sort precedence for portfolio rows; lower values sort first.
RECOMMENDATION_ORDER = {
    "target_now": 0,
    "target_after_localization": 1,
    "supporting_content": 2,
    "watchlist": 3,
    "exclude": 4,
}
@dataclass
class SourceRow:
    # One keyword row read from a single export file, before aggregation.
    keyword: str  # original keyword text as exported
    normalized: str  # normalize_keyword() form, used as the aggregation key
    source_name: str  # "keyword_trends_export" or "keyword_planner_export"
    source_path: str  # repo-relative path of the export file
    volume: int  # impressions (stats export) or avg monthly searches (planner)
    raw_metric_name: str  # which metric `volume` holds
    competition: str = ""  # Keyword Planner competition level, planner rows only
    competition_index: int = 0  # Keyword Planner indexed competition value
    raw_trends: str = ""  # raw Trends column, stats-export rows only
@dataclass
class KeywordAggregate:
    # Metrics for one normalized keyword, merged across all input files.
    keyword: str  # display form; aggregate_rows() keeps the highest-volume variant
    normalized: str  # shared normalization key
    source_names: set[str] = field(default_factory=set)
    source_paths: set[str] = field(default_factory=set)
    file1_impressions: int = 0  # summed impressions from the stats export
    file2_avg_monthly_searches: int = 0  # summed searches from planner exports
    competitions: set[str] = field(default_factory=set)  # distinct competition labels seen
    competition_index_max: int = 0  # highest indexed competition across sources
    raw_trends: list[str] = field(default_factory=list)  # raw Trends strings collected
def clean_int(value: str | None) -> int:
    """Parse an integer out of a formatted metric cell.

    Strips thousands separators and any other non-ASCII-digit characters
    ("1,234" -> 1234). Empty/None values parse as 0.
    """
    if not value:
        return 0
    digits = "".join(ch for ch in str(value) if "0" <= ch <= "9")
    return int(digits) if digits else 0
def normalize_keyword(value: str) -> str:
    """Canonicalize a raw keyword for aggregation and matching.

    Steps, in order: NFKC fold, strip bidi control characters, lowercase,
    treat underscores/pipes/slashes/pluses as spaces, spell out "&" as
    "and", replace punctuation outside word chars / Arabic / CJK / hyphen
    with spaces, then collapse runs of whitespace.
    """
    folded = unicodedata.normalize("NFKC", value or "")
    folded = re.sub(r"[\u200e\u200f\u202a-\u202e\u2066-\u2069]", "", folded)
    folded = folded.lower().replace("_", " ").replace("&", " and ")
    folded = re.sub(r"[|/+]+", " ", folded)
    folded = re.sub(r"[^\w\s\u0600-\u06FF\u4E00-\u9FFF-]", " ", folded, flags=re.UNICODE)
    return re.sub(r"\s+", " ", folded).strip()
# Detects any Arabic or CJK character; hoisted to module level so
# contains_any() does not rebuild/look up the pattern for every marker on
# every call (it runs once per marker per keyword across the whole dataset).
_ARABIC_OR_CJK_RE = re.compile(r"[\u0600-\u06FF\u4E00-\u9FFF]")


def contains_any(text: str, markers: set[str]) -> bool:
    """Return True if *text* matches any marker.

    Matching rules:
    - Markers containing Arabic/CJK characters match as plain substrings,
      because those scripts are not reliably space-delimited.
    - Multi-word markers match as substrings of the full text.
    - Single-word markers must equal a whole whitespace-delimited token.
    """
    tokens = set(text.split())
    for marker in markers:
        if _ARABIC_OR_CJK_RE.search(marker):
            if marker in text:
                return True
            continue
        if " " in marker and marker in text:
            return True
        if marker in tokens:
            return True
    return False
def has_token_or_phrase(keyword: str, markers: set[str]) -> bool:
    """Alias for contains_any(); kept for call-site readability."""
    return contains_any(keyword, markers)
def matches_any_pattern(keyword: str, patterns: list[re.Pattern[str]]) -> bool:
    """Report whether *keyword* matches at least one compiled pattern."""
    for pattern in patterns:
        if pattern.search(keyword):
            return True
    return False
def discover_default_inputs() -> list[Path]:
    """Collect the base exports plus supplemental CSVs, deduplicated by file name.

    Base inputs that exist on disk come first; any *.csv under
    SUPPLEMENTAL_INPUT_DIR is appended (sorted) unless a file with the same
    name was already taken.
    """
    discovered = [candidate for candidate in BASE_INPUTS if candidate.exists()]
    known_names = {candidate.name for candidate in discovered}
    if SUPPLEMENTAL_INPUT_DIR.exists():
        for candidate in sorted(SUPPLEMENTAL_INPUT_DIR.glob("*.csv")):
            if candidate.name not in known_names:
                discovered.append(candidate)
                known_names.add(candidate.name)
    return discovered
DEFAULT_INPUTS = discover_default_inputs()
def strip_informational_prefix(keyword: str) -> str:
    """Drop a leading how-to style phrase so intent whitelists can match the core action."""
    prefixes = ("how to ", "comment ", "كيفية ", "كيف ", "how do i ")
    matched = next((prefix for prefix in prefixes if keyword.startswith(prefix)), None)
    if matched is None:
        return keyword
    return keyword[len(matched):].strip()
def detect_language(keyword: str) -> str:
    """Best-effort language guess from script ranges, then marker vocabulary.

    Arabic and CJK scripts win outright; otherwise marker sets are checked
    in a fixed order (fr, es, it, pt) because some words appear in several
    vocabularies. Defaults to "en".
    """
    if re.search(r"[\u0600-\u06FF]", keyword):
        return "ar"
    if re.search(r"[\u4E00-\u9FFF]", keyword):
        return "zh"
    ordered_checks = (
        (FRENCH_MARKERS, "fr"),
        (SPANISH_MARKERS, "es"),
        (ITALIAN_MARKERS, "it"),
        (PORTUGUESE_MARKERS, "pt"),
    )
    for markers, language in ordered_checks:
        if has_token_or_phrase(keyword, markers):
            return language
    return "en"
def detect_brands(keyword: str) -> list[str]:
    """Return the sorted competitor brand names mentioned in *keyword*."""
    return sorted(
        brand
        for brand, pattern in BRAND_PATTERNS.items()
        if pattern.search(keyword)
    )
def extract_modifiers(keyword: str, brand_hits: list[str]) -> list[str]:
    """List qualifier tags present in *keyword*; "brand" is appended last when applicable."""
    tag_checks = (
        (HOW_TO_MARKERS, "how_to"),
        (FREE_MARKERS, "free"),
        (ONLINE_MARKERS, "online"),
        (PAGE_MARKERS, "pages"),
        (FILE_MARKERS, "files"),
    )
    modifiers = [tag for markers, tag in tag_checks if contains_any(keyword, markers)]
    if brand_hits:
        modifiers.append("brand")
    return modifiers
def classify_cluster(keyword: str) -> str:
    """Assign a keyword to one content cluster (a CLUSTER_METADATA key).

    The if-chain order is the precedence: specific converter intents
    (pdf-to-word, word-to-pdf, ocr) win over generic verbs, combined
    split+merge intent is separated into its own hub cluster, and generic
    conversion / hub phrasing is matched last before falling back to
    "unclear".
    """
    has_pdf_to_word = contains_any(keyword, PDF_TO_WORD_MARKERS) or matches_any_pattern(keyword, PDF_TO_WORD_VALID_PATTERNS)
    has_word_to_pdf = contains_any(keyword, WORD_TO_PDF_MARKERS) or matches_any_pattern(keyword, WORD_TO_PDF_VALID_PATTERNS)
    has_ocr = contains_any(keyword, OCR_MARKERS) or matches_any_pattern(keyword, OCR_VALID_PATTERNS)
    has_split = contains_any(keyword, SPLIT_MARKERS)
    has_extract = contains_any(keyword, EXTRACT_MARKERS)
    has_merge = contains_any(keyword, MERGE_MARKERS)
    has_compress = contains_any(keyword, COMPRESS_MARKERS)
    has_convert = contains_any(keyword, CONVERT_MARKERS)
    has_edit = contains_any(keyword, EDIT_MARKERS)
    has_image_to_pdf = contains_any(keyword, IMAGE_TO_PDF_MARKERS)
    has_pdf_tool = contains_any(keyword, PDF_TOOL_MARKERS)
    if has_pdf_to_word:
        return "pdf-to-word"
    if has_word_to_pdf:
        return "word-to-pdf"
    if has_ocr:
        return "ocr"
    # Split + merge together is hub intent, not a single-action page.
    if has_split and has_merge:
        return "mixed-pdf-operations"
    if has_extract:
        return "extract-pages"
    if has_split or "to pages" in keyword:
        return "split-pdf"
    if has_merge:
        return "merge-pdf"
    if has_compress:
        return "compress-pdf"
    if has_image_to_pdf:
        return "images-to-pdf"
    if has_edit:
        return "pdf-editor"
    if has_convert or keyword.startswith("pdf to ") or keyword.endswith(" to pdf"):
        return "pdf-conversion"
    if has_pdf_tool:
        return "pdf-tools-hub"
    return "unclear"
def repeated_phrase(tokens: list[str]) -> bool:
    """True when the token list is an exact repetition of a shorter chunk.

    Flags planner spam such as "split pdf split pdf". Lists shorter than
    four tokens are never considered repeated.
    """
    total = len(tokens)
    if total < 4:
        return False
    return any(
        tokens == tokens[:size] * (total // size)
        for size in range(1, total // 2 + 1)
        if total % size == 0
    )
def detect_noise_reason(keyword: str, cluster: str, brand_hits: list[str], file1_impressions: int, file2_searches: int) -> str:
    """Return an exclusion reason for a noisy keyword, or "" to keep it.

    Guards run in order: known-ambiguous exacts, spammy repetition, branded
    terms (kept here — recommendation_for() routes them to the watchlist),
    non-PDF terms, low-volume unclear terms, malformed "pd f" variants,
    remaining unclear terms, and finally the per-cluster phrase whitelist.
    """
    tokens = keyword.split()
    if keyword in AMBIGUOUS_EXACT:
        return "too_broad_or_ambiguous"
    if keyword == "page separator":
        return "not_pdf_specific"
    if repeated_phrase(tokens):
        return "repeated_phrase_spam"
    if tokens and max(Counter(tokens).values()) >= 3 and len(set(tokens)) <= 3:
        return "repeated_tokens_spam"
    # Branded keywords are never excluded here; they become watchlist entries.
    if brand_hits:
        return ""
    if "pdf" not in keyword and cluster not in {"pdf-tools-hub", "pdf-editor", "images-to-pdf", "ocr"}:
        return "not_pdf_specific"
    if cluster == "unclear" and max(file1_impressions, file2_searches) < 500:
        return "unclear_low_value"
    if keyword.startswith("pd f") or keyword.endswith("pd f"):
        return "malformed_keyword"
    if cluster == "unclear":
        return "manual_review_required"
    cluster_phrase_issue = detect_cluster_phrase_issue(keyword, cluster)
    if cluster_phrase_issue:
        return cluster_phrase_issue
    return ""
def detect_cluster_phrase_issue(keyword: str, cluster: str) -> str:
    """Validate a keyword against its cluster's whitelist of natural phrasings.

    The keyword is checked after stripping any how-to prefix. Returns
    "unnatural_cluster_phrase" when the cluster has a whitelist and the
    candidate matches none of its patterns (or trips a duplicate-word
    heuristic such as containing "pdf" twice); returns "" when the phrase
    looks natural or the cluster has no whitelist.
    """
    candidate = strip_informational_prefix(keyword)
    if cluster == "split-pdf":
        # Doubled "pdf" is allowed only for a short list of known-good forms.
        if candidate.count("pdf") > 1 and candidate not in {"pdf split", "pdf splitter", "pdf separator", "pdf page separator", "pdf separate", "pdf divider", "pdf cutter"}:
            return "unnatural_cluster_phrase"
        if any(pattern.search(candidate) for pattern in SPLIT_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "extract-pages":
        if any(pattern.search(candidate) for pattern in EXTRACT_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "merge-pdf":
        if candidate.count("pdf") > 1:
            return "unnatural_cluster_phrase"
        if any(pattern.search(candidate) for pattern in MERGE_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "compress-pdf":
        if candidate.count("pdf") > 1 or candidate.count("compress") > 1 or candidate.count("compressor") > 1:
            return "unnatural_cluster_phrase"
        if any(pattern.search(candidate) for pattern in COMPRESS_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "pdf-to-word":
        if any(pattern.search(candidate) for pattern in PDF_TO_WORD_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "word-to-pdf":
        if any(pattern.search(candidate) for pattern in WORD_TO_PDF_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "ocr":
        if any(pattern.search(candidate) for pattern in OCR_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "pdf-conversion":
        if candidate == "pdf converter":
            return ""
        if candidate.count("pdf") > 1:
            return "unnatural_cluster_phrase"
        if any(pattern.search(candidate) for pattern in CONVERSION_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "pdf-editor":
        if candidate.count("pdf") > 1:
            return "unnatural_cluster_phrase"
        if any(pattern.search(candidate) for pattern in EDITOR_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "images-to-pdf":
        if candidate.count("pdf") > 1:
            return "unnatural_cluster_phrase"
        if any(pattern.search(candidate) for pattern in IMAGE_TO_PDF_VALID_PATTERNS):
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "mixed-pdf-operations":
        if candidate in {"pdf split and merge", "split and merge pdf"}:
            return ""
        return "unnatural_cluster_phrase"
    if cluster == "pdf-tools-hub":
        if "pdf tools" in keyword:
            return ""
        return "unnatural_cluster_phrase"
    # Clusters without a whitelist (e.g. "unclear") always pass.
    return ""
def detect_intent(keyword: str, brand_hits: list[str]) -> str:
    """Classify search intent; precedence: competitor > informational > commercial > transactional."""
    if brand_hits:
        return "competitor"
    if contains_any(keyword, HOW_TO_MARKERS):
        return "informational"
    return "commercial_investigation" if "pdf tools" in keyword else "transactional"
def market_bucket(language: str) -> str:
    """Map a language code to its execution bucket; unknown codes go to the watchlist."""
    buckets = {
        "en": "core_en",
        "es": "growth_es",
        "ar": "expansion_ar",
        "fr": "expansion_fr",
    }
    return buckets.get(language, "watchlist_other")
def recommendation_for(language: str, intent: str, cluster: str, brand_hits: list[str], noise_reason: str) -> tuple[str, str]:
    """Decide the portfolio action for one keyword, as (recommendation, reason).

    Precedence: noise exclusion, competitor brands, unsupported markets,
    Spanish growth handling, hub-style terms, informational support content,
    then the default transactional target.
    """
    if noise_reason:
        return "exclude", noise_reason
    if brand_hits:
        return "watchlist", "competitor_branded"
    if language in WATCHLIST_LANGUAGES:
        return "watchlist", "unsupported_language_market"
    if language in GROWTH_LANGUAGES:
        informational = intent == "informational"
        action = "supporting_content" if informational else "target_after_localization"
        reason = "spanish_content_after_localization" if informational else "spanish_localization_required"
        return action, reason
    if cluster == "pdf-tools-hub":
        return "supporting_content", "homepage_or_tools_hub"
    if intent == "informational":
        return "supporting_content", "blog_or_faq_support"
    return "target_now", "mapped_to_live_page_or_current_i18n"
def score_keyword(file1_impressions: int, file2_searches: int, max_file1: int, max_file2: int) -> float:
    """Blend log-scaled metrics into a 0-100 priority score.

    Each metric is normalized as log10(value+1) / log10(ceiling+1); the
    impressions share is weighted 45 and the planner-searches share 55. A
    zero ceiling contributes 0 for that metric.
    """
    def normalized(value: int, ceiling: int) -> float:
        if not ceiling:
            return 0.0
        return math.log10(value + 1) / math.log10(ceiling + 1)

    blended = normalized(file1_impressions, max_file1) * 45 + normalized(file2_searches, max_file2) * 55
    return round(blended, 2)
def load_keyword_stats(path: Path) -> list[SourceRow]:
    """Parse the comma-separated keyword stats export (impressions + trends).

    Rows without a Keyword value are skipped. Volume is the Impressions
    column, and the raw Trends string is carried along for later reporting.
    """
    rows: list[SourceRow] = []
    # utf-8-sig tolerates the BOM that Google Ads prepends to CSV downloads.
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        for row in reader:
            keyword = (row.get("Keyword") or "").strip()
            if not keyword:
                continue
            rows.append(
                SourceRow(
                    keyword=keyword,
                    normalized=normalize_keyword(keyword),
                    source_name="keyword_trends_export",
                    source_path=str(path.relative_to(ROOT)).replace("\\", "/"),
                    volume=clean_int(row.get("Impressions")),
                    raw_metric_name="impressions",
                    raw_trends=(row.get("Trends") or "").strip(),
                )
            )
    return rows
def load_keyword_planner(path: Path) -> list[SourceRow]:
    """Parse a Keyword Planner export (avg monthly searches + competition).

    Planner downloads are UTF-16 and tab-separated, with a two-line report
    preamble before the real header row — hence the manual read/slice
    instead of handing the file object straight to DictReader.
    """
    rows: list[SourceRow] = []
    with path.open("r", encoding="utf-16") as handle:
        lines = handle.read().splitlines()
    reader = csv.DictReader(lines[2:], delimiter="\t")
    for row in reader:
        keyword = (row.get("Keyword") or "").strip()
        if not keyword:
            continue
        rows.append(
            SourceRow(
                keyword=keyword,
                normalized=normalize_keyword(keyword),
                source_name="keyword_planner_export",
                source_path=str(path.relative_to(ROOT)).replace("\\", "/"),
                volume=clean_int(row.get("Avg. monthly searches")),
                raw_metric_name="avg_monthly_searches",
                competition=(row.get("Competition") or "").strip(),
                competition_index=clean_int(row.get("Competition (indexed value)")),
            )
        )
    return rows
def aggregate_rows(rows: list[SourceRow]) -> list[KeywordAggregate]:
    """Merge per-file rows into one aggregate per normalized keyword.

    Impressions (trends export) and avg monthly searches (planner exports)
    are summed into separate fields. The display keyword is replaced by any
    incoming surface form whose single-row volume beats the aggregate's
    best metric so far. Rows with an empty normalization are dropped.
    """
    aggregates: dict[str, KeywordAggregate] = {}
    for row in rows:
        if not row.normalized:
            continue
        aggregate = aggregates.get(row.normalized)
        if aggregate is None:
            aggregate = KeywordAggregate(keyword=row.keyword, normalized=row.normalized)
            aggregates[row.normalized] = aggregate
        # Keep the highest-volume surface form as the display keyword.
        current_best = max(aggregate.file1_impressions, aggregate.file2_avg_monthly_searches)
        incoming_best = row.volume
        if incoming_best > current_best:
            aggregate.keyword = row.keyword
        aggregate.source_names.add(row.source_name)
        aggregate.source_paths.add(row.source_path)
        if row.source_name == "keyword_trends_export":
            aggregate.file1_impressions += row.volume
            if row.raw_trends:
                aggregate.raw_trends.append(row.raw_trends)
        else:
            aggregate.file2_avg_monthly_searches += row.volume
            if row.competition:
                aggregate.competitions.add(row.competition)
            aggregate.competition_index_max = max(aggregate.competition_index_max, row.competition_index)
    return list(aggregates.values())
def build_keyword_rows(aggregates: list[KeywordAggregate]) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Classify, score, and sort every aggregate into portfolio rows.

    Returns (kept, excluded). Kept rows are sorted by recommendation tier
    (RECOMMENDATION_ORDER), then descending priority score, then keyword,
    and assigned a 1-based "priority_rank". Excluded rows are sorted by
    score only. All cell values are stringified for CSV/markdown output.
    """
    # Per-metric maxima are the log-normalization ceilings for score_keyword().
    max_file1 = max((item.file1_impressions for item in aggregates), default=0)
    max_file2 = max((item.file2_avg_monthly_searches for item in aggregates), default=0)
    rows: list[dict[str, str]] = []
    excluded: list[dict[str, str]] = []
    for aggregate in aggregates:
        normalized = aggregate.normalized
        brand_hits = detect_brands(normalized)
        language = detect_language(normalized)
        cluster = classify_cluster(normalized)
        metadata = CLUSTER_METADATA[cluster]
        modifiers = extract_modifiers(normalized, brand_hits)
        noise_reason = detect_noise_reason(
            normalized,
            cluster,
            brand_hits,
            aggregate.file1_impressions,
            aggregate.file2_avg_monthly_searches,
        )
        intent = detect_intent(normalized, brand_hits)
        recommendation, rationale = recommendation_for(language, intent, cluster, brand_hits, noise_reason)
        priority_score = score_keyword(
            aggregate.file1_impressions,
            aggregate.file2_avg_monthly_searches,
            max_file1,
            max_file2,
        )
        row = {
            "keyword": aggregate.keyword,
            "normalized_keyword": normalized,
            "language": language,
            "market_bucket": market_bucket(language),
            "intent": intent,
            "cluster": cluster,
            "cluster_label": metadata["label"],
            "recommended_target": metadata["recommended_target"],
            "target_type": metadata["target_type"],
            "recommendation": recommendation,
            "recommendation_reason": rationale,
            "priority_score": f"{priority_score:.2f}",
            "file1_impressions": str(aggregate.file1_impressions),
            "file2_avg_monthly_searches": str(aggregate.file2_avg_monthly_searches),
            "competition_levels": ", ".join(sorted(aggregate.competitions)),
            "competition_index_max": str(aggregate.competition_index_max),
            "brands": ", ".join(brand_hits),
            "modifiers": ", ".join(modifiers),
            "source_count": str(len(aggregate.source_names)),
            "sources": ", ".join(sorted(aggregate.source_names)),
            "source_paths": ", ".join(sorted(aggregate.source_paths)),
            "notes": metadata["implementation_note"],
        }
        if recommendation == "exclude":
            excluded.append(row)
        else:
            rows.append(row)
    rows.sort(
        key=lambda item: (
            RECOMMENDATION_ORDER[item["recommendation"]],
            -float(item["priority_score"]),
            item["normalized_keyword"],
        )
    )
    excluded.sort(key=lambda item: (-float(item["priority_score"]), item["normalized_keyword"]))
    # Rank only the kept rows; write_summary() and the CSVs rely on this field.
    for index, row in enumerate(rows, start=1):
        row["priority_rank"] = str(index)
    return rows, excluded
def build_cluster_rows(rows: list[dict[str, str]], excluded: list[dict[str, str]]) -> list[dict[str, str]]:
    """Roll keyword rows (kept + excluded) up into one summary row per cluster.

    cluster_score sums only non-excluded keyword scores, while the
    keyword/metric counts cover everything in the cluster. The top keyword
    is the best targetable row, falling back to the best excluded row when
    a cluster has no targetable keywords. Result is sorted by cluster_score
    descending.
    """
    grouped: dict[str, list[dict[str, str]]] = {}
    for row in rows + excluded:
        grouped.setdefault(row["cluster"], []).append(row)
    cluster_rows = []
    for cluster, items in grouped.items():
        metadata = CLUSTER_METADATA[cluster]
        targetable = [item for item in items if item["recommendation"] != "exclude"]
        sorted_items = sorted(items, key=lambda item: -float(item["priority_score"]))
        top_candidates = sorted(targetable, key=lambda item: -float(item["priority_score"]))
        top_item = top_candidates[0] if top_candidates else sorted_items[0]
        cluster_rows.append(
            {
                "cluster": cluster,
                "cluster_label": metadata["label"],
                "recommended_target": metadata["recommended_target"],
                "target_type": metadata["target_type"],
                "cluster_score": f"{sum(float(item['priority_score']) for item in targetable):.2f}",
                "keywords_total": str(len(items)),
                "target_now_keywords": str(sum(item["recommendation"] == "target_now" for item in items)),
                "target_after_localization_keywords": str(sum(item["recommendation"] == "target_after_localization" for item in items)),
                "supporting_content_keywords": str(sum(item["recommendation"] == "supporting_content" for item in items)),
                "watchlist_keywords": str(sum(item["recommendation"] == "watchlist" for item in items)),
                "excluded_keywords": str(sum(item["recommendation"] == "exclude" for item in items)),
                "top_keyword": top_item["keyword"],
                "top_language": top_item["language"],
                "file1_impressions": str(sum(int(item["file1_impressions"]) for item in items)),
                "file2_avg_monthly_searches": str(sum(int(item["file2_avg_monthly_searches"]) for item in items)),
                "implementation_note": metadata["implementation_note"],
            }
        )
    cluster_rows.sort(key=lambda item: -float(item["cluster_score"]))
    return cluster_rows
def to_markdown_table(rows: list[dict[str, str]], headers: list[tuple[str, str]]) -> str:
    """Render rows as a GitHub-flavored markdown table.

    *headers* is a list of (row_key, column_label) pairs; missing keys
    render as empty cells. Returns the "_No rows._" placeholder for an
    empty row list.
    """
    if not rows:
        return "_No rows._"
    table_lines = [
        "| " + " | ".join(label for _, label in headers) + " |",
        "| " + " | ".join(["---"] * len(headers)) + " |",
    ]
    for row in rows:
        cells = [str(row.get(key, "")) for key, _ in headers]
        table_lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(table_lines)
def write_csv(path: Path, rows: list[dict[str, str]], fieldnames: list[str]) -> None:
    """Write *rows* to a UTF-8 CSV at *path*, creating parent directories.

    Keys outside *fieldnames* are silently dropped (extrasaction="ignore").
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
def write_summary(
    output_path: Path,
    input_paths: list[Path],
    raw_rows: list[SourceRow],
    aggregates: list[KeywordAggregate],
    rows: list[dict[str, str]],
    excluded: list[dict[str, str]],
    clusters: list[dict[str, str]],
) -> None:
    """Render the markdown summary report for this run and write it to *output_path*."""
    counts_by_recommendation = Counter(row["recommendation"] for row in rows)
    counts_by_recommendation.update(row["recommendation"] for row in excluded)
    # Language/market breakdowns intentionally exclude watchlist rows.
    visible_rows = [row for row in rows if row["recommendation"] != "watchlist"]
    language_counts = Counter(row["language"] for row in visible_rows)
    market_counts = Counter(row["market_bucket"] for row in visible_rows)
    # rows is already tier-then-score sorted, so plain slices give the top-N.
    top_target_now = [row for row in rows if row["recommendation"] == "target_now"][:15]
    top_localization = [row for row in rows if row["recommendation"] == "target_after_localization"][:15]
    top_supporting = [row for row in rows if row["recommendation"] == "supporting_content"][:12]
    top_watchlist = [row for row in rows if row["recommendation"] == "watchlist"][:10]
    top_clusters = clusters[:10]
    top_excluded = excluded[:10]
    input_paths_display = [str(path.relative_to(ROOT)).replace("\\", "/") for path in input_paths]
    input_list = "\n".join(f"- {path}" for path in input_paths_display)
    # Static market narrative paired with live per-bucket keyword counts.
    market_table_rows = [
        {
            "bucket": "core_en",
            "execution": "Target now",
            "notes": "Current product and current data both support this market immediately.",
            "count": str(market_counts.get("core_en", 0)),
        },
        {
            "bucket": "growth_es",
            "execution": "Target after localization",
            "notes": "Highest upside after English, but the site needs Spanish landing-page coverage first.",
            "count": str(market_counts.get("growth_es", 0)),
        },
        {
            "bucket": "expansion_fr",
            "execution": "Target now where demand exists",
            "notes": "Supported in product and ready for selective rollout where the uploads show clear intent.",
            "count": str(market_counts.get("expansion_fr", 0)),
        },
        {
            "bucket": "expansion_ar",
            "execution": "Target supported terms, expand with native research",
            "notes": "Product support exists and the latest uploads surface Arabic conversion intent, but category coverage still needs broader native-language research.",
            "count": str(market_counts.get("expansion_ar", 0)),
        },
    ]
    content = f"""# Keyword Portfolio - 2026-04-05
Generated with `scripts/build_keyword_portfolio.py` from the latest Google Ads exports.
## Source Files
{input_list}
## Source Overview
- Raw rows processed: {len(raw_rows)}
- Unique normalized keywords: {len(aggregates)}
- Included or watchlist keywords: {len(rows)}
- Excluded keywords: {len(excluded)}
- `target_now`: {counts_by_recommendation.get('target_now', 0)}
- `target_after_localization`: {counts_by_recommendation.get('target_after_localization', 0)}
- `supporting_content`: {counts_by_recommendation.get('supporting_content', 0)}
- `watchlist`: {counts_by_recommendation.get('watchlist', 0)}
- `exclude`: {counts_by_recommendation.get('exclude', 0)}
## Recommended Market Mix
{to_markdown_table(market_table_rows, [('bucket', 'Market Bucket'), ('execution', 'Execution'), ('count', 'Keywords'), ('notes', 'Notes')])}
## Language Distribution (Non-Watchlist)
{to_markdown_table([
{'language': language, 'count': str(count)}
for language, count in sorted(language_counts.items(), key=lambda item: (-item[1], item[0]))
], [('language', 'Language'), ('count', 'Keywords')])}
## Priority Clusters
{to_markdown_table(top_clusters, [
('cluster_label', 'Cluster'),
('recommended_target', 'Recommended Target'),
('cluster_score', 'Cluster Score'),
('target_now_keywords', 'Target Now'),
('target_after_localization_keywords', 'Target After Localization'),
('watchlist_keywords', 'Watchlist'),
('top_keyword', 'Top Keyword'),
])}
## Top Keywords to Target Now
{to_markdown_table(top_target_now, [
('priority_rank', 'Rank'),
('keyword', 'Keyword'),
('language', 'Language'),
('cluster_label', 'Cluster'),
('file2_avg_monthly_searches', 'Avg Monthly Searches'),
('file1_impressions', 'Impressions'),
('priority_score', 'Score'),
('recommended_target', 'Target'),
])}
## Spanish Growth Keywords
{to_markdown_table(top_localization, [
('priority_rank', 'Rank'),
('keyword', 'Keyword'),
('cluster_label', 'Cluster'),
('file2_avg_monthly_searches', 'Avg Monthly Searches'),
('file1_impressions', 'Impressions'),
('priority_score', 'Score'),
('recommendation_reason', 'Why'),
])}
## Supporting Content Keywords
{to_markdown_table(top_supporting, [
('priority_rank', 'Rank'),
('keyword', 'Keyword'),
('language', 'Language'),
('cluster_label', 'Cluster'),
('priority_score', 'Score'),
('recommendation_reason', 'Why'),
])}
## Watchlist
{to_markdown_table(top_watchlist, [
('priority_rank', 'Rank'),
('keyword', 'Keyword'),
('language', 'Language'),
('brands', 'Brands'),
('priority_score', 'Score'),
('recommendation_reason', 'Why'),
])}
## Excluded Samples
{to_markdown_table(top_excluded, [
('keyword', 'Keyword'),
('language', 'Language'),
('cluster_label', 'Cluster'),
('priority_score', 'Score'),
('recommendation_reason', 'Exclusion Reason'),
])}
## Implementation Notes
- The combined exports now show immediate live-page opportunities across `split pdf`, `compress pdf`, `merge pdf`, `pdf to word`, `word to pdf`, and adjacent OCR/conversion intent.
- Spanish is the strongest growth market in the uploaded data, but those keywords are intentionally separated into `target_after_localization` until the site ships Spanish landing pages.
- Arabic and French remain strategically valid because the product already supports both languages. Use the current dataset for targeted pages now, then supplement with native-language research before scaling site-wide coverage.
- Competitor-branded phrases are kept in the watchlist only. They should not be mixed into the core unbranded landing-page portfolio.
- Generic or malformed terms are excluded when they are too broad, not PDF-specific, or obviously generated noise from Keyword Planner suggestions.
## Output Files
- `prioritized_keywords.csv` - master portfolio with recommendation status, market bucket, cluster mapping, and source metrics.
- `keyword_clusters.csv` - cluster-level rollup for page planning.
- `excluded_keywords.csv` - excluded or noisy terms with reasons.
"""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content, encoding="utf-8")
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the portfolio builder.

    Args:
        argv: Explicit argument list, mainly for tests. When ``None`` (the
            default, and the previous hard-coded behavior), argparse reads
            from ``sys.argv[1:]``.

    Returns:
        Namespace with ``output_dir`` (str) and ``inputs`` (list[str]).
    """
    parser = argparse.ArgumentParser(description="Build a keyword portfolio from Google Ads exports.")
    parser.add_argument(
        "--output-dir",
        default=str(DEFAULT_OUTPUT_DIR),
        help="Directory where the generated deliverables will be written.",
    )
    parser.add_argument(
        "--inputs",
        # nargs="*" permits an explicitly empty list; main() rejects that case.
        nargs="*",
        default=[str(path) for path in DEFAULT_INPUTS],
        help="Input export files. Supports the repository's CSV and UTF-16 TSV Google Ads formats.",
    )
    return parser.parse_args(argv)
def main() -> None:
    """Entry point: load the exports, score the keywords, and write every deliverable.

    Pipeline: parse CLI args -> resolve paths against the repository root ->
    load each export with the format-appropriate loader -> aggregate, score,
    and cluster -> emit three CSVs plus a markdown strategy summary.
    """
    args = parse_args()

    def _resolve(raw: str) -> Path:
        # Relative paths are interpreted against the repository root, not CWD.
        candidate = Path(raw)
        return candidate if candidate.is_absolute() else ROOT / raw

    input_paths = [_resolve(raw) for raw in args.inputs]
    output_dir = _resolve(args.output_dir)
    if not input_paths:
        raise FileNotFoundError("No keyword input files were found. Add exports under docs/keyword-research/2026-04-05/Keywords or pass --inputs explicitly.")

    # The legacy stats export has its own parser; everything else is treated
    # as a Keyword Planner export.
    source_rows: list[SourceRow] = []
    for path in input_paths:
        loader = load_keyword_stats if path.name == "KeywordStats_4_5_2026.csv" else load_keyword_planner
        source_rows.extend(loader(path))

    aggregated = aggregate_rows(source_rows)
    portfolio_rows, excluded_rows = build_keyword_rows(aggregated)
    cluster_rows = build_cluster_rows(portfolio_rows, excluded_rows)

    # Column orders below define the exact CSV schemas consumed downstream.
    portfolio_columns = [
        "priority_rank",
        "recommendation",
        "recommendation_reason",
        "market_bucket",
        "keyword",
        "normalized_keyword",
        "language",
        "intent",
        "cluster",
        "cluster_label",
        "recommended_target",
        "target_type",
        "priority_score",
        "file2_avg_monthly_searches",
        "file1_impressions",
        "competition_levels",
        "competition_index_max",
        "brands",
        "modifiers",
        "source_count",
        "sources",
        "source_paths",
        "notes",
    ]
    excluded_columns = [
        "keyword",
        "normalized_keyword",
        "language",
        "intent",
        "cluster",
        "cluster_label",
        "priority_score",
        "file2_avg_monthly_searches",
        "file1_impressions",
        "brands",
        "modifiers",
        "recommendation_reason",
        "sources",
        "source_paths",
    ]
    cluster_columns = [
        "cluster",
        "cluster_label",
        "recommended_target",
        "target_type",
        "cluster_score",
        "keywords_total",
        "target_now_keywords",
        "target_after_localization_keywords",
        "supporting_content_keywords",
        "watchlist_keywords",
        "excluded_keywords",
        "top_keyword",
        "top_language",
        "file1_impressions",
        "file2_avg_monthly_searches",
        "implementation_note",
    ]

    write_csv(output_dir / "prioritized_keywords.csv", portfolio_rows, portfolio_columns)
    write_csv(output_dir / "excluded_keywords.csv", excluded_rows, excluded_columns)
    write_csv(output_dir / "keyword_clusters.csv", cluster_rows, cluster_columns)
    write_summary(output_dir / "keyword_strategy.md", input_paths, source_rows, aggregated, portfolio_rows, excluded_rows, cluster_rows)

    print(f"Generated keyword portfolio in {output_dir}")
    print(f"Included rows: {len(portfolio_rows)}")
    print(f"Excluded rows: {len(excluded_rows)}")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()