#!/usr/bin/env python3

"""Build the Python docs for various branches and various languages.

Without any arguments builds docs for all active versions and
languages.

Environment variables for:

- `SENTRY_DSN` (Error reporting)
- `FASTLY_SERVICE_ID` / `FASTLY_TOKEN` (CDN purges)
- `PYTHON_DOCS_ENABLE_ANALYTICS` (Enable Plausible for online docs)

are read from the site configuration path for your platform
(/etc/xdg/docsbuild-scripts on linux) if available,
and can be overriden by writing a file to the user config dir
for your platform ($HOME/.config/docsbuild-scripts on linux).
The contents of the file is parsed as toml:

```toml
[env]
SENTRY_DSN = "https://0a0a0a0a0a0a0a0a0a0a0a@sentry.io/69420"
FASTLY_SERVICE_ID = "deadbeefdeadbeefdead"
FASTLY_TOKEN = "secureme!"
PYTHON_DOCS_ENABLE_ANALYTICS = "1"
```

Languages are stored in `config.toml` while versions are discovered
from the devguide.

-q selects "quick build", which means to build only HTML.

Translations are fetched from GitHub repositories according to PEP
545. `--languages` allows selecting translations, like `--languages
en` to just build the English documents.

This script was originally created by Georg Brandl in March 2010.
Modified by Benjamin Peterson to do CDN cache invalidation.
Modified by Julien Palard to build translations.

"""

from __future__ import annotations

import argparse
import concurrent.futures
import dataclasses
import datetime as dt
import filecmp
import json
import logging
import logging.handlers
import os
import re
import shlex
import shutil
import stat
import subprocess
import sys
import venv
from bisect import bisect_left as bisect
from contextlib import contextmanager, suppress
from pathlib import Path
from string import Template
from time import perf_counter, sleep
from urllib.parse import urljoin

import jinja2
import platformdirs
import tomlkit
import urllib3
import zc.lockfile

TYPE_CHECKING = False
if TYPE_CHECKING:
    from collections.abc import Collection, Iterator, Sequence, Set
    from typing import Literal

try:
    from os import EX_OK
    from os import EX_SOFTWARE as EX_FAILURE
except ImportError:
    EX_OK, EX_FAILURE = 0, 1

try:
    import sentry_sdk
except ImportError:
    sentry_sdk = None
else:
    sentry_sdk.init()

HERE = Path(__file__).resolve().parent


@dataclasses.dataclass(frozen=True, slots=True)
class Versions:
    _seq: Sequence[Version]

    def __iter__(self) -> Iterator[Version]:
        return iter(self._seq)

    def __reversed__(self) -> Iterator[Version]:
        return reversed(self._seq)

    @classmethod
    def from_json(cls, data: dict) -> Versions:
        """Load versions from the devguide's JSON representation."""
        permitted = ", ".join(sorted(Version.STATUSES | Version.SYNONYMS.keys()))

        versions = []
        for name, release in data.items():
            branch = release["branch"]
            status = release["status"]
            status = Version.SYNONYMS.get(status, status)
            if status not in Version.STATUSES:
                msg = (
                    f"Saw invalid version status {status!r}, "
                    f"expected to be one of {permitted}."
                )
                raise ValueError(msg)
            versions.append(Version(name=name, status=status, branch_or_tag=branch))

        return cls(sorted(versions, key=Version.as_tuple))

    def filter(self, branches: Sequence[str] = ()) -> Sequence[Version]:
        """Filter the given versions.

        If *branches* is given, only *versions* matching *branches* are returned.

        Else all live versions are returned (this means no EOL and no
        security-fixes branches).
        """
        if branches:
            branches = frozenset(branches)
            return [v for v in self if {v.name, v.branch_or_tag} & branches]
        return [v for v in self if v.status not in {"EOL", "security-fixes"}]

    @property
    def current_stable(self) -> Version:
        """Find the current stable CPython version."""
        return max((v for v in self if v.status == "stable"), key=Version.as_tuple)

    @property
    def current_dev(self) -> Version:
        """Find the current CPython version in development."""
        return max(self, key=Version.as_tuple)


@dataclasses.dataclass(frozen=True, kw_only=True, slots=True)
class Version:
    """Represents a CPython version and its documentation build dependencies."""

    name: str
    status: Literal["EOL", "security-fixes", "stable", "pre-release", "in development"]
    branch_or_tag: str

    STATUSES = {"EOL", "security-fixes", "stable", "pre-release", "in development"}

    # Those synonyms map branch status vocabulary found in the devguide
    # with our vocabulary.
    SYNONYMS = {
        "feature": "in development",
        "bugfix": "stable",
        "security": "security-fixes",
        "end-of-life": "EOL",
        "prerelease": "pre-release",
    }

    def __eq__(self, other: Version) -> bool:
        return self.name == other.name

    @property
    def requirements(self) -> list[str]:
        """Generate the right requirements for this version.

        Since CPython 3.8 a Doc/requirements.txt file can be used.

        In case the Doc/requirements.txt is absent or wrong (a
        sub-dependency broke), use this function to override it.

        See https://github.com/python/cpython/issues/91294
        See https://github.com/python/cpython/issues/91483

        """
        dependencies = [
            "-rrequirements.txt",
            "jieba",  # To improve zh search.
            "PyStemmer~=2.2.0",  # To improve performance for word stemming.
        ]
        if self.as_tuple() >= (3, 11):
            return dependencies
        if self.as_tuple() >= (3, 8):
            # Restore the imghdr module for Python 3.8-3.10.
            return dependencies + ["standard-imghdr"]

        # Requirements/constraints for Python 3.7 and older, pre-requirements.txt
        reqs = [
            "alabaster<0.7.12",
            "blurb<1.2",
            "docutils<=0.17.1",
            "jieba",
            "jinja2<3.1",
            "python-docs-theme<=2023.3.1",
            "sphinxcontrib-applehelp<=1.0.2",
            "sphinxcontrib-devhelp<=1.0.2",
            "sphinxcontrib-htmlhelp<=2.0",
            "sphinxcontrib-jsmath<=1.0.1",
            "sphinxcontrib-qthelp<=1.0.3",
            "sphinxcontrib-serializinghtml<=1.1.5",
            "standard-imghdr",
        ]
        if self.name in {"3.7", "3.6", "2.7"}:
            return reqs + ["sphinx==2.3.1"]
        if self.name == "3.5":
            return reqs + ["sphinx==1.8.4", "standard-pipes"]
        raise ValueError("unreachable")

    @property
    def changefreq(self) -> str:
        """Estimate this version change frequency, for the sitemap."""
        return {"EOL": "never", "security-fixes": "yearly"}.get(self.status, "daily")

    def as_tuple(self) -> tuple[int, ...]:
        """This version name as tuple, for easy comparisons."""
        return version_to_tuple(self.name)

    @property
    def url(self) -> str:
        """The doc URL of this version in production."""
        return f"https://docs.python.org/{self.name}/"

    @property
    def title(self) -> str:
        """The title of this version's doc, for the sidebar."""
        return f"Python {self.name} ({self.status})"

    @property
    def picker_label(self) -> str:
        """Forge the label of a version picker."""
        if self.status == "in development":
            return f"dev ({self.name})"
        if self.status == "pre-release":
            return f"pre ({self.name})"
        return self.name


@dataclasses.dataclass(frozen=True, slots=True)
class Languages:
    _seq: Sequence[Language]

    def __iter__(self) -> Iterator[Language]:
        return iter(self._seq)

    def __reversed__(self) -> Iterator[Language]:
        return reversed(self._seq)

    @classmethod
    def from_json(cls, defaults: dict, languages: dict) -> Languages:
        default_translated_name = defaults.get("translated_name", "")
        default_in_prod = defaults.get("in_prod", True)
        default_sphinxopts = defaults.get("sphinxopts", [])
        default_html_only = defaults.get("html_only", False)
        langs = [
            Language(
                iso639_tag=iso639_tag,
                name=section["name"],
                translated_name=section.get("translated_name", default_translated_name),
                in_prod=section.get("in_prod", default_in_prod),
                sphinxopts=section.get("sphinxopts", default_sphinxopts),
                html_only=section.get("html_only", default_html_only),
            )
            for iso639_tag, section in languages.items()
        ]
        return cls(langs)

    def filter(self, language_tags: Sequence[str] = ()) -> Sequence[Language]:
        """Filter a sequence of languages according to --languages."""
        if language_tags:
            language_tags = frozenset(language_tags)
            return [l for l in self if l.tag in language_tags]  # NoQA: E741
        return list(self)


@dataclasses.dataclass(order=True, frozen=True, kw_only=True)
class Language:
    iso639_tag: str
    name: str
    translated_name: str
    in_prod: bool
    sphinxopts: Sequence[str]
    html_only: bool = False

    @property
    def tag(self) -> str:
        return self.iso639_tag.replace("_", "-").lower()

    @property
    def switcher_label(self) -> str:
        if self.translated_name:
            return f"{self.name} | {self.translated_name}"
        return self.name


@dataclasses.dataclass(frozen=True, kw_only=True, slots=True)
class BuildMetadata:
    _version: Version
    _language: Language

    @property
    def sphinxopts(self) -> Sequence[str]:
        return self._language.sphinxopts

    @property
    def iso639_tag(self) -> str:
        return self._language.iso639_tag

    @property
    def html_only(self) -> bool:
        return self._language.html_only or not self._language.in_prod

    @property
    def url(self):
        """The URL of this version in production."""
        if self.is_translation:
            return f"https://docs.python.org/{self.version}/{self.language}/"
        return f"https://docs.python.org/{self.version}/"

    @property
    def branch_or_tag(self) -> str:
        return self._version.branch_or_tag

    @property
    def status(self) -> str:
        return self._version.status

    @property
    def is_eol(self) -> bool:
        return self._version.status == "EOL"

    @property
    def dependencies(self) -> list[str]:
        return self._version.requirements

    @property
    def version(self):
        return self._version.name

    @property
    def version_tuple(self):
        return self._version.as_tuple()

    @property
    def language(self):
        return self._language.tag

    @property
    def is_translation(self):
        return self.language != "en"

    @property
    def slug(self) -> str:
        return f"{self.language}/{self.version}"

    @property
    def venv_name(self) -> str:
        return f"venv-{self.version}"

    @property
    def locale_repo_url(self) -> str:
        return f"https://github.com/python/python-docs-{self.language}.git"


def run(
    cmd: Sequence[str | Path], cwd: Path | None = None
) -> subprocess.CompletedProcess:
    """Like subprocess.run, with logging before and after the command execution."""
    cmd = list(map(str, cmd))
    cmdstring = shlex.join(cmd)
    logging.debug("Run: '%s'", cmdstring)
    result = subprocess.run(
        cmd,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        errors="backslashreplace",
        check=False,
    )
    if result.returncode:
        # Log last 20 lines, those are likely the interesting ones.
        logging.error(
            "Run: '%s' KO:\n%s",
            cmdstring,
            "\n".join(f"    {line}" for line in result.stdout.split("\n")[-20:]),
        )
    result.check_returncode()
    return result


def run_with_logging(cmd: Sequence[str | Path], cwd: Path | None = None) -> None:
    """Like subprocess.check_call, with logging before the command execution."""
    cmd = list(map(str, cmd))
    logging.debug("Run: '%s'", shlex.join(cmd))
    with subprocess.Popen(
        cmd,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE,
        encoding="utf-8",
    ) as p:
        try:
            for line in p.stdout or ():
                logging.debug(">>>> %s", line.rstrip())
        except:
            p.kill()
            raise
    if return_code := p.poll():
        raise subprocess.CalledProcessError(return_code, cmd[0])


def changed_files(left: Path, right: Path) -> int:
    """Compute the number of different files in the two directory trees."""

    def traverse(dircmp_result: filecmp.dircmp) -> int:
        changed = len(dircmp_result.diff_files)
        changed += sum(map(traverse, dircmp_result.subdirs.values()))
        return changed

    return traverse(filecmp.dircmp(left, right))


@dataclasses.dataclass
class Repository:
    """Git repository abstraction for our specific needs."""

    remote: str
    directory: Path

    def run(self, *args: str) -> subprocess.CompletedProcess:
        """Run git command in the clone repository."""
        return run(("git", "-C", self.directory) + args)

    def get_ref(self, pattern: str) -> str:
        """Return the reference of a given tag or branch."""
        try:
            # Maybe it's a branch
            return self.run("show-ref", "-s", f"origin/{pattern}").stdout.strip()
        except subprocess.CalledProcessError:
            # Maybe it's a tag
            return self.run("show-ref", "-s", f"tags/{pattern}").stdout.strip()

    def fetch(self) -> subprocess.CompletedProcess:
        """Try (and retry) to run git fetch."""
        try:
            return self.run("fetch")
        except subprocess.CalledProcessError as err:
            logging.error("'git fetch' failed (%s), retrying...", err.stderr)
            sleep(5)
        return self.run("fetch")

    def switch(self, branch_or_tag: str) -> None:
        """Reset and cleans the repository to the given branch or tag."""
        self.run("reset", "--hard", self.get_ref(branch_or_tag), "--")
        self.run("clean", "-dfqx")

    def clone(self) -> bool:
        """Maybe clone the repository, if not already cloned."""
        if (self.directory / ".git").is_dir():
            return False  # Already cloned
        logging.info("Cloning %s into %s", self.remote, self.directory)
        self.directory.mkdir(mode=0o775, parents=True, exist_ok=True)
        run(("git", "clone", self.remote, self.directory))
        return True

    def update(self) -> None:
        self.clone() or self.fetch()


def version_to_tuple(version: str) -> tuple[int, ...]:
    """Transform a version string to a tuple, for easy comparisons."""
    return tuple(int(part) for part in version.split("."))


def tuple_to_version(version_tuple: tuple[int, ...]) -> str:
    """Reverse version_to_tuple."""
    return ".".join(str(part) for part in version_tuple)


def locate_nearest_version(
    available_versions: Collection[str], target_version: str
) -> str:
    """Look for the nearest version of target_version in available_versions.
    Versions are to be given as tuples, like (3, 7) for 3.7.

    >>> locate_nearest_version(["2.7", "3.6", "3.7", "3.8"], "3.9")
    '3.8'
    >>> locate_nearest_version(["2.7", "3.6", "3.7", "3.8"], "3.5")
    '3.6'
    >>> locate_nearest_version(["2.7", "3.6", "3.7", "3.8"], "2.6")
    '2.7'
    >>> locate_nearest_version(["2.7", "3.6", "3.7", "3.8"], "3.10")
    '3.8'
    >>> locate_nearest_version(["2.7", "3.6", "3.7", "3.8"], "3.7")
    '3.7'
    """

    available_versions_tuples = sorted(map(version_to_tuple, set(available_versions)))
    target_version_tuple = version_to_tuple(target_version)
    try:
        found = available_versions_tuples[
            bisect(available_versions_tuples, target_version_tuple)
        ]
    except IndexError:
        found = available_versions_tuples[-1]
    return tuple_to_version(found)


@contextmanager
def edit(file: Path):
    """Context manager to edit a file "in place", use it as:

    with edit("/etc/hosts") as (i, o):
        for line in i:
            o.write(line.replace("localhoat", "localhost"))
    """
    temporary = file.with_name(file.name + ".tmp")
    with suppress(FileNotFoundError):
        temporary.unlink()
    with open(file, encoding="UTF-8") as input_file:
        with open(temporary, "w", encoding="UTF-8") as output_file:
            yield input_file, output_file
    temporary.rename(file)


def setup_switchers(script_content: bytes, html_root: Path) -> None:
    """Setup cross-links between CPython versions:
    - Cross-link various languages in a language switcher
    - Cross-link various versions in a version switcher
    """
    switchers_path = html_root / "_static" / "switchers.js"
    switchers_path.write_bytes(script_content)

    for file in html_root.glob("**/*.html"):
        depth = len(file.relative_to(html_root).parts) - 1
        src = f"{'../' * depth}_static/switchers.js"
        script = f'    <script type="text/javascript" src="{src}"></script>\n'
        with edit(file) as (ifile, ofile):
            for line in ifile:
                if line == script:
                    continue
                if line == "  </body>\n":
                    ofile.write(script)
                ofile.write(line)


def head(text: str, lines: int = 10) -> str:
    """Return the first *lines* lines from the given text."""
    return "\n".join(text.split("\n")[:lines])


def version_info() -> None:
    """Handler for --version."""
    try:
        platex_version = head(
            subprocess.check_output(("platex", "--version"), text=True),
            lines=3,
        )
    except FileNotFoundError:
        platex_version = "Not installed."

    try:
        xelatex_version = head(
            subprocess.check_output(("xelatex", "--version"), text=True),
            lines=2,
        )
    except FileNotFoundError:
        xelatex_version = "Not installed."
    print(
        f"""
# platex

{platex_version}


# xelatex

{xelatex_version}
    """
    )


@dataclasses.dataclass
class DocBuilder:
    """Builder for a CPython version and a language."""

    build_meta: BuildMetadata
    cpython_repo: Repository
    docs_by_version_content: bytes
    switchers_content: bytes
    build_root: Path
    www_root: Path
    select_output: Literal["no-html", "only-html", "only-html-en"] | None
    quick: bool
    group: str
    log_directory: Path
    skip_cache_invalidation: bool
    theme: str

    @property
    def html_only(self) -> bool:
        return (
            self.select_output in {"only-html", "only-html-en"}
            or self.quick
            or self.build_meta.html_only
        )

    @property
    def includes_html(self) -> bool:
        """Does the build we are running include HTML output?"""
        return self.select_output != "no-html"

    def run(self, http: urllib3.PoolManager, force_build: bool) -> bool | None:
        """Build and publish a Python doc, for a language, and a version."""
        start_time = perf_counter()
        start_timestamp = dt.datetime.now(tz=dt.UTC).replace(microsecond=0)
        logging.info("Running.")
        try:
            if self.build_meta.html_only and not self.includes_html:
                logging.info("Skipping non-HTML build (language is HTML-only).")
                return None  # skipped
            self.cpython_repo.switch(self.build_meta.branch_or_tag)
            if self.build_meta.is_translation:
                self.clone_translation()
            if trigger_reason := self.should_rebuild(force_build):
                self.build_venv()
                self.build()
                self.copy_build_to_webroot(http)
                self.save_state(
                    build_start=start_timestamp,
                    build_duration=perf_counter() - start_time,
                    trigger=trigger_reason,
                )
            else:
                return None  # skipped
        except Exception as err:
            logging.exception("Badly handled exception, human, please help.")
            if sentry_sdk:
                sentry_sdk.capture_exception(err)
            return False
        return True

    @property
    def locale_dir(self) -> Path:
        return self.build_root / self.build_meta.version / "locale"

    @property
    def checkout(self) -> Path:
        """Path to CPython git clone."""
        return self.build_root / _checkout_name(self.select_output)

    def clone_translation(self) -> None:
        self.translation_repo.update()
        self.translation_repo.switch(self.translation_branch)

    @property
    def translation_repo(self) -> Repository:
        """See PEP 545 for translations repository naming convention."""

        locale_clone_dir = self.locale_dir / self.build_meta.iso639_tag / "LC_MESSAGES"
        return Repository(self.build_meta.locale_repo_url, locale_clone_dir)

    @property
    def translation_branch(self) -> str:
        """Some CPython versions may be untranslated, being either too old or
        too new.

        This function looks for remote branches on the given repo, and
        returns the name of the nearest existing branch.

        It could be enhanced to also search for tags.
        """
        remote_branches = self.translation_repo.run("branch", "-r").stdout
        branches = re.findall(r"/([0-9]+\.[0-9]+)$", remote_branches, re.M)
        return locate_nearest_version(branches, self.build_meta.version)

    def build(self) -> None:
        """Build this version/language doc."""
        logging.info("Build start.")
        start_time = perf_counter()
        sphinxopts = list(self.build_meta.sphinxopts)
        if self.build_meta.is_translation:
            sphinxopts.extend((
                f"-D locale_dirs={self.locale_dir}",
                f"-D language={self.build_meta.iso639_tag}",
                "-D gettext_compact=0",
                "-D translation_progress_classes=1",
            ))

        if self.build_meta.is_eol:
            sphinxopts.append("-D html_context.outdated=1")

        if self.build_meta.status in ("in development", "pre-release"):
            maketarget = "autobuild-dev"
        else:
            maketarget = "autobuild-stable"
        if self.html_only:
            maketarget += "-html"
        logging.info("Running make %s", maketarget)
        python = self.venv / "bin" / "python"
        sphinxbuild = self.venv / "bin" / "sphinx-build"
        blurb = self.venv / "bin" / "blurb"

        if self.includes_html:
            site_url = self.build_meta.url
            # Define a tag to enable opengraph socialcards previews
            # (used in Doc/conf.py and requires matplotlib)
            sphinxopts += (
                "-t create-social-cards",
                f"-D ogp_site_url={site_url}",
            )

            if self.build_meta.version_tuple < (3, 8):
                # Disable CPython switchers, we handle them now:
                text = (self.checkout / "Doc" / "Makefile").read_text(encoding="utf-8")
                text = text.replace(" -A switchers=1", "")
                (self.checkout / "Doc" / "Makefile").write_text(text, encoding="utf-8")

            self.setup_indexsidebar()
        run_with_logging((
            "make",
            "-C",
            self.checkout / "Doc",
            f"PYTHON={python}",
            f"SPHINXBUILD={sphinxbuild}",
            f"BLURB={blurb}",
            f"VENVDIR={self.venv}",
            f"SPHINXOPTS={' '.join(sphinxopts)}",
            "SPHINXERRORHANDLING=",
            maketarget,
        ))
        self.log_directory.mkdir(parents=True, exist_ok=True)
        chgrp(self.log_directory, group=self.group, recursive=True)
        if self.includes_html:
            setup_switchers(
                self.switchers_content, self.checkout / "Doc" / "build" / "html"
            )
        logging.info("Build done (%s).", format_seconds(perf_counter() - start_time))

    def build_venv(self) -> None:
        """Build a venv for the specific Python version.

        So we can reuse them from builds to builds, while they contain
        different Sphinx versions.
        """
        requirements = list(self.build_meta.dependencies)
        if self.includes_html:
            # opengraph previews
            requirements.append("matplotlib>=3")

        venv_path = self.build_root / self.build_meta.venv_name
        venv.create(venv_path, symlinks=os.name != "nt", with_pip=True)
        run(
            (
                venv_path / "bin" / "python",
                "-m",
                "pip",
                "install",
                "--upgrade",
                "--upgrade-strategy=eager",
                self.theme,
                *requirements,
            ),
            cwd=self.checkout / "Doc",
        )
        run((venv_path / "bin" / "python", "-m", "pip", "freeze", "--all"))
        self.venv = venv_path

    def setup_indexsidebar(self) -> None:
        """Copy indexsidebar.html for Sphinx."""
        tmpl_src = HERE / "templates"
        tmpl_dst = self.checkout / "Doc" / "tools" / "templates"
        dbv_path = tmpl_dst / "_docs_by_version.html"

        shutil.copy(tmpl_src / "indexsidebar.html", tmpl_dst / "indexsidebar.html")
        if not self.build_meta.is_eol:
            dbv_path.write_bytes(self.docs_by_version_content)
        else:
            shutil.copy(tmpl_src / "_docs_by_version.html", dbv_path)

    def copy_build_to_webroot(self, http: urllib3.PoolManager) -> None:
        """Copy a given build to the appropriate webroot with appropriate rights."""
        logging.info("Publishing start.")
        start_time = perf_counter()
        self.www_root.mkdir(parents=True, exist_ok=True)
        if not self.build_meta.is_translation:
            target = self.www_root / self.build_meta.version
        else:
            language_dir = self.www_root / self.build_meta.language
            language_dir.mkdir(parents=True, exist_ok=True)
            chgrp(language_dir, group=self.group, recursive=True)
            language_dir.chmod(0o775)
            target = language_dir / self.build_meta.version

        target.mkdir(parents=True, exist_ok=True)
        try:
            target.chmod(0o775)
        except PermissionError as err:
            logging.warning("Can't change mod of %s: %s", target, str(err))
        chgrp(target, group=self.group, recursive=True)

        changed = 0
        if self.includes_html:
            # Copy built HTML files to webroot (default /srv/docs.python.org)
            changed += changed_files(self.checkout / "Doc" / "build" / "html", target)
            logging.info("Copying HTML files to %s", target)
            chgrp(
                self.checkout / "Doc" / "build" / "html/",
                group=self.group,
                recursive=True,
            )
            chmod_make_readable(self.checkout / "Doc" / "build" / "html")
            run((
                "rsync",
                "-a",
                "--delete-delay",
                "--filter",
                "P archives/",
                str(self.checkout / "Doc" / "build" / "html") + "/",
                target,
            ))

        dist_dir = self.checkout / "Doc" / "dist"
        if dist_dir.is_dir():
            # Copy archive files to /archives/
            logging.debug("Copying dist files.")
            chgrp(dist_dir, group=self.group, recursive=True)
            chmod_make_readable(dist_dir)
            archives_dir = target / "archives"
            archives_dir.mkdir(parents=True, exist_ok=True)
            archives_dir.chmod(
                archives_dir.stat().st_mode | stat.S_IROTH | stat.S_IXOTH
            )
            chgrp(archives_dir, group=self.group)
            changed += 1
            for dist_file in dist_dir.iterdir():
                shutil.copy2(dist_file, archives_dir / dist_file.name)
                changed += 1

        logging.info("%s files changed", changed)
        if changed and not self.skip_cache_invalidation:
            purge_surrogate_key(http, self.build_meta.slug)
        logging.info(
            "Publishing done (%s).", format_seconds(perf_counter() - start_time)
        )

    def should_rebuild(self, force: bool) -> str | Literal[False]:
        state = self.load_state()
        if not state:
            logging.info("Should rebuild: no previous state found.")
            return "no previous state"
        cpython_sha = self.cpython_repo.run("rev-parse", "HEAD").stdout.strip()
        if self.build_meta.is_translation:
            translation_sha = self.translation_repo.run(
                "rev-parse", "HEAD"
            ).stdout.strip()
            if translation_sha != state["translation_sha"]:
                logging.info(
                    "Should rebuild: new translations (from %s to %s)",
                    state["translation_sha"],
                    translation_sha,
                )
                return "new translations"
        if cpython_sha != state["cpython_sha"]:
            diff = self.cpython_repo.run(
                "diff", "--name-only", state["cpython_sha"], cpython_sha
            ).stdout
            if "Doc/" in diff or "Misc/NEWS.d/" in diff:
                logging.info(
                    "Should rebuild: Doc/ has changed (from %s to %s)",
                    state["cpython_sha"],
                    cpython_sha,
                )
                return "Doc/ has changed"
        if force:
            logging.info("Should rebuild: forced.")
            return "forced"
        logging.info("Nothing changed, no rebuild needed.")
        return False

    def load_state(self) -> dict:
        if self.select_output is not None:
            state_file = self.build_root / f"state-{self.select_output}.toml"
        else:
            state_file = self.build_root / "state.toml"
        try:
            return tomlkit.loads(state_file.read_text(encoding="UTF-8"))[
                f"/{self.build_meta.slug}/"
            ]
        except (KeyError, FileNotFoundError):
            return {}

    def save_state(
        self, build_start: dt.datetime, build_duration: float, trigger: str
    ) -> None:
        """Save current CPython sha1 and current translation sha1.

        Using this we can deduce if a rebuild is needed or not.
        """
        if self.select_output is not None:
            state_file = self.build_root / f"state-{self.select_output}.toml"
        else:
            state_file = self.build_root / "state.toml"
        try:
            states = tomlkit.parse(state_file.read_text(encoding="UTF-8"))
        except FileNotFoundError:
            states = tomlkit.document()

        key = f"/{self.build_meta.slug}/"
        state = {
            "last_build_start": build_start,
            "last_build_duration": round(build_duration, 0),
            "triggered_by": trigger,
            "cpython_sha": self.cpython_repo.run("rev-parse", "HEAD").stdout.strip(),
        }
        if self.build_meta.is_translation:
            state["translation_sha"] = self.translation_repo.run(
                "rev-parse", "HEAD"
            ).stdout.strip()
        states[key] = state
        state_file.write_text(tomlkit.dumps(states), encoding="UTF-8")

        table = tomlkit.inline_table()
        table |= state
        logging.info("Saved new rebuild state for %s: %s", key, table.as_string())


def chgrp(
    path: Path,
    /,
    group: int | str | None,
    *,
    recursive: bool = False,
    follow_symlinks: bool = True,
) -> None:
    if sys.platform == "win32":
        return

    from grp import getgrnam

    try:
        try:
            group_id = int(group)
        except ValueError:
            group_id = getgrnam(group)[2]
    except (LookupError, TypeError, ValueError):
        return

    try:
        os.chown(path, -1, group_id, follow_symlinks=follow_symlinks)
        if recursive:
            for p in path.rglob("*"):
                os.chown(p, -1, group_id, follow_symlinks=follow_symlinks)
    except OSError as err:
        logging.warning("Can't change group of %s: %s", path, str(err))


def chmod_make_readable(path: Path, /, mode: int = stat.S_IROTH) -> None:
    if not path.is_dir():
        raise ValueError

    path.chmod(path.stat().st_mode | stat.S_IROTH | stat.S_IXOTH)  # o+rx
    for p in path.rglob("*"):
        if p.is_dir():
            p.chmod(p.stat().st_mode | stat.S_IROTH | stat.S_IXOTH)  # o+rx
        else:
            p.chmod(p.stat().st_mode | stat.S_IROTH)  # o+r


def format_seconds(seconds: float) -> str:
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    hours, minutes, seconds = int(hours), int(minutes), round(seconds)

    match (hours, minutes, seconds):
        case 0, 0, s:
            return f"{s}s"
        case 0, m, s:
            return f"{m}m {s}s"
        case h, m, s:
            return f"{h}h {m}m {s}s"

    raise ValueError("unreachable")


def _checkout_name(select_output: str | None) -> str:
    if select_output is not None:
        return f"cpython-{select_output}"
    return "cpython"


def main() -> int:
    """Script entry point."""
    args = parse_args()
    setup_logging(args.log_directory, args.select_output)
    load_environment_variables()

    if args.select_output is None:
        return build_docs_with_lock(args, "build_docs.lock")
    if args.select_output == "no-html":
        return build_docs_with_lock(args, "build_docs_archives.lock")
    if args.select_output == "only-html":
        return build_docs_with_lock(args, "build_docs_html.lock")
    if args.select_output == "only-html-en":
        return build_docs_with_lock(args, "build_docs_html_en.lock")
    return EX_FAILURE


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments."""

    parser = argparse.ArgumentParser(
        description="Runs a build of the Python docs for various branches.",
        allow_abbrev=False,
    )
    parser.suggest_on_error = True
    parser.add_argument(
        "--select-output",
        choices=("no-html", "only-html", "only-html-en"),
        help="Choose what outputs to build.",
    )
    parser.add_argument(
        "-q",
        "--quick",
        action="store_true",
        help="Run a quick build (only HTML files).",
    )
    parser.add_argument(
        "-b",
        "--branches",
        nargs="*",
        metavar="3.12",
        help="Versions to build (defaults to all maintained branches).",
    )
    parser.add_argument(
        "-r",
        "--build-root",
        type=Path,
        help="Path to a directory containing a checkout per branch.",
        default=Path("/srv/docsbuild"),
    )
    parser.add_argument(
        "-w",
        "--www-root",
        type=Path,
        help="Path where generated files will be copied.",
        default=Path("/srv/docs.python.org"),
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Always build the chosen languages and versions, "
        "regardless of existing state.",
    )
    parser.add_argument(
        "--skip-cache-invalidation",
        help="Skip Fastly cache invalidation.",
        action="store_true",
    )
    parser.add_argument(
        "--group",
        help="Group files on targets and www-root file should get.",
        default="docs",
    )
    parser.add_argument(
        "--log-directory",
        type=Path,
        help="Directory used to store logs.",
        default=Path("/var/log/docsbuild/"),
    )
    parser.add_argument(
        "--languages",
        nargs="*",
        help="Language translation, as a PEP 545 language tag like"
        " 'fr' or 'pt-br'. "
        "Builds all available languages by default.",
        metavar="fr",
    )
    parser.add_argument(
        "--version",
        action="store_true",
        help="Get build_docs and dependencies version info",
    )
    parser.add_argument(
        "--theme",
        default="python-docs-theme",
        help="Python package to use for python-docs-theme: Useful to test branches:"
        " --theme git+https://github.com/obulat/python-docs-theme@master",
    )
    args = parser.parse_args()
    if args.version:
        version_info()
        sys.exit(0)
    del args.version
    if args.log_directory:
        args.log_directory = args.log_directory.resolve()
    if args.build_root:
        args.build_root = args.build_root.resolve()
    if args.www_root:
        args.www_root = args.www_root.resolve()
    return args


def setup_logging(log_directory: Path, select_output: str | None) -> None:
    """Setup logging to stderr if run by a human, or to a file if run from a cron."""
    log_format = "%(asctime)s %(levelname)s: %(message)s"
    if sys.stderr.isatty() or "CI" in os.environ:
        logging.basicConfig(format=log_format, stream=sys.stderr)
    else:
        log_directory.mkdir(parents=True, exist_ok=True)
        if select_output is None:
            filename = log_directory / "docsbuild.log"
        else:
            filename = log_directory / f"docsbuild-{select_output}.log"
        handler = logging.handlers.WatchedFileHandler(filename)
        handler.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(handler)
    logging.getLogger().setLevel(logging.DEBUG)


def load_environment_variables() -> None:
    dbs_user_config = platformdirs.user_config_path("docsbuild-scripts")
    dbs_site_config = platformdirs.site_config_path("docsbuild-scripts")
    if dbs_user_config.is_file():
        env_conf_file = dbs_user_config
    elif dbs_site_config.is_file():
        env_conf_file = dbs_site_config
    else:
        logging.info(
            "No environment variables configured. Configure in %s or %s.",
            dbs_site_config,
            dbs_user_config,
        )
        return

    logging.info("Reading environment variables from %s.", env_conf_file)
    if env_conf_file == dbs_site_config:
        logging.info("You can override settings in %s.", dbs_user_config)
    elif dbs_site_config.is_file():
        logging.info("Overriding %s.", dbs_site_config)

    env_config = env_conf_file.read_text(encoding="utf-8")
    for key, value in tomlkit.parse(env_config).get("env", {}).items():
        logging.debug("Setting %s in environment.", key)
        os.environ[key] = value


def build_docs_with_lock(args: argparse.Namespace, lockfile_name: str) -> int:
    try:
        lock = zc.lockfile.LockFile(HERE / lockfile_name)
    except zc.lockfile.LockError:
        logging.info("Another builder is running... dying...")
        return EX_FAILURE

    try:
        return build_docs(args)
    finally:
        lock.close()


def build_docs(args: argparse.Namespace) -> int:
    """Build all docs (each language and each version)."""
    logging.info("Full build start.")
    start_time = perf_counter()
    http = urllib3.PoolManager()
    versions = parse_versions_from_devguide(http)
    languages = parse_languages_from_config()
    # Reverse languages but not versions, because we take version-language
    # pairs from the end of the list, effectively reversing it.
    # This runs languages in config.toml order and versions newest first.
    todo = [
        BuildMetadata(_version=version, _language=language)
        for version in versions.filter(args.branches)
        for language in reversed(languages.filter(args.languages))
    ]
    del args.branches
    del args.languages
    force_build = args.force
    del args.force

    docs_by_version_content = render_docs_by_version(versions).encode()
    switchers_content = render_switchers(versions, languages)

    build_succeeded = set()
    any_build_failed = False
    cpython_repo = Repository(
        "https://github.com/python/cpython.git",
        args.build_root / _checkout_name(args.select_output),
    )
    while todo:
        build_props = todo.pop()
        logging.root.handlers[0].setFormatter(
            logging.Formatter(
                f"%(asctime)s %(levelname)s {build_props.slug}: %(message)s"
            )
        )
        if sentry_sdk:
            scope = sentry_sdk.get_isolation_scope()
            scope.set_tag("version", build_props.version)
            scope.set_tag("language", build_props.language)
        cpython_repo.update()
        builder = DocBuilder(
            build_props,
            cpython_repo,
            docs_by_version_content,
            switchers_content,
            **vars(args),
        )
        built_successfully = builder.run(http, force_build=force_build)
        if built_successfully:
            build_succeeded.add(build_props.slug)
        elif built_successfully is not None:
            any_build_failed = True

    logging.root.handlers[0].setFormatter(
        logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
    )

    build_sitemap(versions, languages, args.www_root, args.group)
    build_404(args.www_root, args.group)
    copy_robots_txt(
        args.www_root,
        args.group,
        args.skip_cache_invalidation,
        http,
    )
    make_symlinks(
        args.www_root,
        args.group,
        versions,
        languages,
        build_succeeded,
        args.skip_cache_invalidation,
        http,
    )
    if build_succeeded:
        # Only check canonicals if at least one version was built.
        proofread_canonicals(args.www_root, args.skip_cache_invalidation, http)

    logging.info("Full build done (%s).", format_seconds(perf_counter() - start_time))

    return EX_FAILURE if any_build_failed else EX_OK


def parse_versions_from_devguide(http: urllib3.PoolManager) -> Versions:
    releases = http.request(
        "GET",
        "https://raw.githubusercontent.com/"
        "python/devguide/main/include/release-cycle.json",
        timeout=30,
    ).json()
    return Versions.from_json(releases)


def parse_languages_from_config() -> Languages:
    """Read config.toml to discover languages to build."""
    config = tomlkit.parse((HERE / "config.toml").read_text(encoding="UTF-8"))
    return Languages.from_json(config["defaults"], config["languages"])


def render_docs_by_version(versions: Versions) -> str:
    """Generate content for _docs_by_version.html."""
    links = [f'<li><a href="{v.url}">{v.title}</a></li>' for v in reversed(versions)]
    return "\n".join(links)


def render_switchers(versions: Versions, languages: Languages) -> bytes:
    language_pairs = sorted((l.tag, l.switcher_label) for l in languages if l.in_prod)  # NoQA: E741
    version_pairs = [(v.name, v.picker_label) for v in reversed(versions)]

    switchers_template_file = HERE / "templates" / "switchers.js"
    template = Template(switchers_template_file.read_text(encoding="UTF-8"))
    rendered_template = template.safe_substitute(
        LANGUAGES=json.dumps(language_pairs),
        VERSIONS=json.dumps(version_pairs),
    )
    return rendered_template.encode("UTF-8")


def build_sitemap(
    versions: Versions, languages: Languages, www_root: Path, group: str
) -> None:
    """Build a sitemap with all live versions and translations."""
    if not www_root.exists():
        logging.info("Skipping sitemap generation (www root does not even exist).")
        return
    logging.info("Starting sitemap generation...")
    template_path = HERE / "templates" / "sitemap.xml"
    template = jinja2.Template(template_path.read_text(encoding="UTF-8"))
    rendered_template = template.render(languages=languages, versions=versions)
    sitemap_path = www_root / "sitemap.xml"
    sitemap_path.write_text(rendered_template + "\n", encoding="UTF-8")
    sitemap_path.chmod(0o664)
    chgrp(sitemap_path, group=group)


def build_404(www_root: Path, group: str) -> None:
    """Build a nice 404 error page to display in case PDFs are not built yet."""
    if not www_root.exists():
        logging.info("Skipping 404 page generation (www root does not even exist).")
        return
    logging.info("Copying 404 page...")
    not_found_file = www_root / "404.html"
    shutil.copyfile(HERE / "templates" / "404.html", not_found_file)
    not_found_file.chmod(0o664)
    chgrp(not_found_file, group=group)


def copy_robots_txt(
    www_root: Path,
    group: str,
    skip_cache_invalidation: bool,
    http: urllib3.PoolManager,
) -> None:
    """Copy robots.txt to www_root."""
    if not www_root.exists():
        logging.info("Skipping copying robots.txt (www root does not even exist).")
        return
    logging.info("Copying robots.txt...")
    template_path = HERE / "templates" / "robots.txt"
    robots_path = www_root / "robots.txt"
    shutil.copyfile(template_path, robots_path)
    robots_path.chmod(0o775)
    chgrp(robots_path, group=group)
    if not skip_cache_invalidation:
        purge(http, "robots.txt")


def make_symlinks(
    www_root: Path,
    group: str,
    versions: Versions,
    languages: Languages,
    successful_builds: Set[str],
    skip_cache_invalidation: bool,
    http: urllib3.PoolManager,
) -> None:
    """Maintains the /2/, /3/, and /dev/ symlinks for each language.

    Like:
    - /2/ → /2.7/
    - /3/ → /3.12/
    - /dev/ → /3.14/
    - /fr/3/ → /fr/3.12/
    - /es/dev/ → /es/3.14/
    """
    logging.info("Creating major and development version symlinks...")
    for symlink_name, symlink_target in (
        ("3", versions.current_stable.name),
        ("2", "2.7"),
        ("dev", versions.current_dev.name),
    ):
        for language in languages:
            if f"{language.tag}/{symlink_target}" in successful_builds:
                symlink(
                    www_root,
                    language.tag,
                    symlink_target,
                    symlink_name,
                    group,
                    skip_cache_invalidation,
                    http,
                )


def symlink(
    www_root: Path,
    language_tag: str,
    directory: str,
    name: str,
    group: str,
    skip_cache_invalidation: bool,
    http: urllib3.PoolManager,
) -> None:
    """Used by major_symlinks and dev_symlink to maintain symlinks."""
    msg = "Creating symlink from /%s/ to /%s/"
    if language_tag == "en":  # English is rooted on /, no /en/
        path = www_root
        logging.debug(msg, name, directory)
    else:
        path = www_root / language_tag
        logging.debug(msg, f"{language_tag}/{name}", f"{language_tag}/{directory}")
    link = path / name
    directory_path = path / directory
    if not directory_path.exists():
        return  # No touching link, dest doc not built yet.

    if not link.exists() or os.readlink(link) != directory:
        # Link does not exist or points to the wrong target.
        link.unlink(missing_ok=True)
        link.symlink_to(directory)
        chgrp(link, group=group, follow_symlinks=False)
    if not skip_cache_invalidation:
        surrogate_key = f"{language_tag}/{name}"
        purge_surrogate_key(http, surrogate_key)


def proofread_canonicals(
    www_root: Path, skip_cache_invalidation: bool, http: urllib3.PoolManager
) -> None:
    """In www_root we check that all canonical links point to existing contents.

    It can happen that a canonical is "broken":

    - /3.11/whatsnew/3.11.html typically would link to
    /3/whatsnew/3.11.html, which may not exist yet.
    """
    logging.info("Checking canonical links...")
    worker_count = (os.cpu_count() or 1) + 2
    with concurrent.futures.ThreadPoolExecutor(worker_count) as executor:
        futures = {
            executor.submit(_check_canonical_rel, file, www_root)
            for file in www_root.glob("**/*.html")
        }
        paths_to_purge = {
            res.relative_to(www_root)  # strip the leading /srv/docs.python.org
            for fut in concurrent.futures.as_completed(futures)
            if (res := fut.result()) is not None
        }
    if not skip_cache_invalidation:
        purge(http, *paths_to_purge)


# Python 3.12 onwards doesn't use self-closing tags for <link rel="canonical">
_canonical_re = re.compile(
    b"""<link rel="canonical" href="https://docs.python.org/([^"]*)"(?: /)?>"""
)


def _check_canonical_rel(file: Path, www_root: Path) -> Path | None:
    # Check for a canonical relation link in the HTML.
    # If one exists, ensure that the target exists
    # or otherwise remove the canonical link element.
    html = file.read_bytes()
    canonical = _canonical_re.search(html)
    if canonical is None:
        return None
    target = canonical[1].decode(encoding="UTF-8", errors="surrogateescape")
    if (www_root / target).exists():
        return None
    logging.info("Removing broken canonical from %s to %s", file, target)
    start, end = canonical.span()
    file.write_bytes(html[:start] + html[end:])
    return file


def purge(http: urllib3.PoolManager, *paths: Path | str) -> None:
    """Remove one or many paths from docs.python.org's CDN.

    To be used when a file changes, so the CDN fetches the new one.
    """
    base = "https://docs.python.org/"
    for path in paths:
        url = urljoin(base, str(path))
        logging.debug("Purging %s from CDN", url)
        http.request("PURGE", url, timeout=30)


def purge_surrogate_key(http: urllib3.PoolManager, surrogate_key: str) -> None:
    """Remove paths from docs.python.org's CDN.

    All paths matching the given 'Surrogate-Key' will be removed.
    This is set by the Nginx server for every language-version pair.
    To be used when a directory changes, so the CDN fetches the new one.

    https://www.fastly.com/documentation/reference/api/purging/#purge-tag
    """
    unset = "__UNSET__"
    service_id = os.environ.get("FASTLY_SERVICE_ID", unset)
    fastly_key = os.environ.get("FASTLY_TOKEN", unset)

    if service_id == unset or fastly_key == unset:
        logging.info("CDN secrets not set, skipping Surrogate-Key purge")
        return

    logging.info("Purging Surrogate-Key '%s' from CDN", surrogate_key)
    http.request(
        "POST",
        f"https://api.fastly.com/service/{service_id}/purge/{surrogate_key}",
        headers={"Fastly-Key": fastly_key},
        timeout=30,
    )


if __name__ == "__main__":
    raise SystemExit(main())