[Python-checkins] [3.12] gh-101100: Only show GitHub check annotations on changed doc paragraphs (GH-108065) (#108127)

Yhg1s webhook-mailer at python.org
Sat Aug 19 19:00:13 EDT 2023


https://github.com/python/cpython/commit/71e3581c969e8a7dbfd21db2407af498fd0027e3
commit: 71e3581c969e8a7dbfd21db2407af498fd0027e3
branch: 3.12
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: Yhg1s <thomas at python.org>
date: 2023-08-20T01:00:09+02:00
summary:

[3.12] gh-101100: Only show GitHub check annotations on changed doc paragraphs (GH-108065) (#108127)

gh-101100: Only show GitHub check annotations on changed doc paragraphs (GH-108065)

* Only show GitHub check annotations on changed doc paragraphs
* Improve check-warnings script arg parsing following Hugo's suggestions
* Factor filtering warnings by modified diffs into helper function
* Build docs on unmerged branch so warning lines match & avoid deep clone

---------

(cherry picked from commit eb953d6e4484339067837020f77eecac61f8d4f8)

Co-authored-by: C.A.M. Gerlach <CAM.Gerlach at Gerlach.CAM>
Co-authored-by: Hugo van Kemenade <hugovk at users.noreply.github.com>
Co-authored-by: Adam Turner <9087854+AA-Turner at users.noreply.github.com>

files:
M .github/workflows/reusable-docs.yml
M Doc/tools/check-warnings.py

diff --git a/.github/workflows/reusable-docs.yml b/.github/workflows/reusable-docs.yml
index 39e5ad62924ad..6150b1a7d416a 100644
--- a/.github/workflows/reusable-docs.yml
+++ b/.github/workflows/reusable-docs.yml
@@ -16,8 +16,30 @@ jobs:
     name: 'Docs'
     runs-on: ubuntu-latest
     timeout-minutes: 60
+    env:
+      branch_base: 'origin/${{ github.event.pull_request.base.ref }}'
+      branch_pr: 'origin/${{ github.event.pull_request.head.ref }}'
+      refspec_base: '+${{ github.event.pull_request.base.sha }}:remotes/origin/${{ github.event.pull_request.base.ref }}'
+      refspec_pr: '+${{ github.event.pull_request.head.sha }}:remotes/origin/${{ github.event.pull_request.head.ref }}'
     steps:
-    - uses: actions/checkout at v3
+    - name: 'Check out latest PR branch commit'
+      uses: actions/checkout at v3
+      with:
+        ref: ${{ github.event.pull_request.head.sha }}
+    # Adapted from https://github.com/actions/checkout/issues/520#issuecomment-1167205721
+    - name: 'Fetch commits to get branch diff'
+      run: |
+        # Fetch enough history to find a common ancestor commit (aka merge-base):
+        git fetch origin ${{ env.refspec_pr }} --depth=$(( ${{ github.event.pull_request.commits }} + 1 )) \
+          --no-tags --prune --no-recurse-submodules
+
+        # This should get the oldest commit in the local fetched history (which may not be the commit the PR branched from):
+        COMMON_ANCESTOR=$( git rev-list --first-parent --max-parents=0 --max-count=1 ${{ env.branch_pr }} )
+        DATE=$( git log --date=iso8601 --format=%cd "${COMMON_ANCESTOR}" )
+
+        # Get all commits since that commit date from the base branch (eg: master or main):
+        git fetch origin ${{ env.refspec_base }} --shallow-since="${DATE}" \
+          --no-tags --prune --no-recurse-submodules
     - name: 'Set up Python'
       uses: actions/setup-python at v4
       with:
@@ -28,13 +50,6 @@ jobs:
       run: make -C Doc/ venv
 
     # To annotate PRs with Sphinx nitpicks (missing references)
-    - name: 'Get list of changed files'
-      if: github.event_name == 'pull_request'
-      id: changed_files
-      uses: Ana06/get-changed-files at v2.2.0
-      with:
-        filter: "Doc/**"
-        format: csv  # works for paths with spaces
     - name: 'Build HTML documentation'
       continue-on-error: true
       run: |
@@ -45,7 +60,7 @@ jobs:
       if: github.event_name == 'pull_request'
       run: |
         python Doc/tools/check-warnings.py \
-          --check-and-annotate '${{ steps.changed_files.outputs.added_modified }}' \
+          --annotate-diff '${{ env.branch_base }}' '${{ env.branch_pr }}' \
           --fail-if-regression \
           --fail-if-improved
 
diff --git a/Doc/tools/check-warnings.py b/Doc/tools/check-warnings.py
index c17d0f51cd127..809a8d63087e1 100644
--- a/Doc/tools/check-warnings.py
+++ b/Doc/tools/check-warnings.py
@@ -2,12 +2,16 @@
 """
 Check the output of running Sphinx in nit-picky mode (missing references).
 """
+from __future__ import annotations
+
 import argparse
-import csv
+import itertools
 import os
 import re
+import subprocess
 import sys
 from pathlib import Path
+from typing import TextIO
 
 # Exclude these whether they're dirty or clean,
 # because they trigger a rebuild of dirty files.
@@ -24,28 +28,178 @@
     "venv",
 }
 
-PATTERN = re.compile(r"(?P<file>[^:]+):(?P<line>\d+): WARNING: (?P<msg>.+)")
+# Regex pattern to match the parts of a Sphinx warning
+WARNING_PATTERN = re.compile(
+    r"(?P<file>([A-Za-z]:[\\/])?[^:]+):(?P<line>\d+): WARNING: (?P<msg>.+)"
+)
+
+# Regex pattern to match the line numbers in a Git unified diff
+DIFF_PATTERN = re.compile(
+    r"^@@ -(?P<linea>\d+)(?:,(?P<removed>\d+))? \+(?P<lineb>\d+)(?:,(?P<added>\d+))? @@",
+    flags=re.MULTILINE,
+)
+
+
+def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]:
+    """List the files changed between two Git refs, filtered by change type."""
+    added_files_result = subprocess.run(
+        [
+            "git",
+            "diff",
+            f"--diff-filter={filter_mode}",
+            "--name-only",
+            f"{ref_a}...{ref_b}",
+            "--",
+        ],
+        stdout=subprocess.PIPE,
+        check=True,
+        text=True,
+        encoding="UTF-8",
+    )
+
+    added_files = added_files_result.stdout.strip().split("\n")
+    return {Path(file.strip()) for file in added_files if file.strip()}
+
+
+def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]:
+    """List the lines changed between two Git refs for a specific file."""
+    diff_output = subprocess.run(
+        [
+            "git",
+            "diff",
+            "--unified=0",
+            f"{ref_a}...{ref_b}",
+            "--",
+            str(file),
+        ],
+        stdout=subprocess.PIPE,
+        check=True,
+        text=True,
+        encoding="UTF-8",
+    )
+
+    # Scrape line offsets + lengths from diff and convert to line numbers
+    line_matches = DIFF_PATTERN.finditer(diff_output.stdout)
+    # Removed and added line counts are 1 if not printed
+    line_match_values = [
+        line_match.groupdict(default=1) for line_match in line_matches
+    ]
+    line_ints = [
+        (int(match_value["lineb"]), int(match_value["added"]))
+        for match_value in line_match_values
+    ]
+    line_ranges = [
+        range(line_b, line_b + added) for line_b, added in line_ints
+    ]
+    line_numbers = list(itertools.chain(*line_ranges))
+
+    return line_numbers
+
+
+def get_para_line_numbers(file_obj: TextIO) -> list[list[int]]:
+    """Get the line numbers of text in a file object, grouped by paragraph."""
+    paragraphs = []
+    prev_line = None
+    for lineno, line in enumerate(file_obj):
+        lineno = lineno + 1
+        if prev_line is None or (line.strip() and not prev_line.strip()):
+            paragraph = [lineno - 1]
+            paragraphs.append(paragraph)
+        paragraph.append(lineno)
+        prev_line = line
+    return paragraphs
+
+
+def filter_and_parse_warnings(
+    warnings: list[str], files: set[Path]
+) -> list[re.Match[str]]:
+    """Get the warnings matching passed files and parse them with regex."""
+    filtered_warnings = [
+        warning
+        for warning in warnings
+        if any(str(file) in warning for file in files)
+    ]
+    warning_matches = [
+        WARNING_PATTERN.fullmatch(warning.strip())
+        for warning in filtered_warnings
+    ]
+    non_null_matches = [warning for warning in warning_matches if warning]
+    return non_null_matches
+
+
+def filter_warnings_by_diff(
+    warnings: list[re.Match[str]], ref_a: str, ref_b: str, file: Path
+) -> list[re.Match[str]]:
+    """Filter the passed per-file warnings to just those on changed lines."""
+    diff_lines = get_diff_lines(ref_a, ref_b, file)
+    with file.open(encoding="UTF-8") as file_obj:
+        paragraphs = get_para_line_numbers(file_obj)
+    touched_paras = [
+        para_lines
+        for para_lines in paragraphs
+        if set(diff_lines) & set(para_lines)
+    ]
+    touched_para_lines = set(itertools.chain(*touched_paras))
+    warnings_infile = [
+        warning for warning in warnings if str(file) in warning["file"]
+    ]
+    warnings_touched = [
+        warning
+        for warning in warnings_infile
+        if int(warning["line"]) in touched_para_lines
+    ]
+    return warnings_touched
+
 
+def process_touched_warnings(
+    warnings: list[str], ref_a: str, ref_b: str
+) -> list[re.Match[str]]:
+    """Filter a list of Sphinx warnings to those affecting touched lines."""
+    added_files, modified_files = tuple(
+        get_diff_files(ref_a, ref_b, filter_mode=mode) for mode in ("A", "M")
+    )
+
+    warnings_added = filter_and_parse_warnings(warnings, added_files)
+    warnings_modified = filter_and_parse_warnings(warnings, modified_files)
+
+    modified_files_warned = {
+        file
+        for file in modified_files
+        if any(str(file) in warning["file"] for warning in warnings_modified)
+    }
 
-def check_and_annotate(warnings: list[str], files_to_check: str) -> None:
+    warnings_modified_touched = [
+        filter_warnings_by_diff(warnings_modified, ref_a, ref_b, file)
+        for file in modified_files_warned
+    ]
+    warnings_touched = warnings_added + list(
+        itertools.chain(*warnings_modified_touched)
+    )
+
+    return warnings_touched
+
+
+def annotate_diff(
+    warnings: list[str], ref_a: str = "main", ref_b: str = "HEAD"
+) -> None:
     """
-    Convert Sphinx warning messages to GitHub Actions.
+    Convert Sphinx warning messages to GitHub Actions for changed paragraphs.
 
     Converts lines like:
         .../Doc/library/cgi.rst:98: WARNING: reference target not found
     to:
         ::warning file=.../Doc/library/cgi.rst,line=98::reference target not found
 
-    Non-matching lines are echoed unchanged.
-
-    see:
+    See:
     https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-a-warning-message
     """
-    files_to_check = next(csv.reader([files_to_check]))
-    for warning in warnings:
-        if any(filename in warning for filename in files_to_check):
-            if match := PATTERN.fullmatch(warning):
-                print("::warning file={file},line={line}::{msg}".format_map(match))
+    warnings_touched = process_touched_warnings(warnings, ref_a, ref_b)
+    print("Emitting doc warnings matching modified lines:")
+    for warning in warnings_touched:
+        print("::warning file={file},line={line}::{msg}".format_map(warning))
+        print(warning[0])
+    if not warnings_touched:
+        print("None")
 
 
 def fail_if_regression(
@@ -68,7 +222,7 @@ def fail_if_regression(
             print(filename)
             for warning in warnings:
                 if filename in warning:
-                    if match := PATTERN.fullmatch(warning):
+                    if match := WARNING_PATTERN.fullmatch(warning):
                         print("  {line}: {msg}".format_map(match))
         return -1
     return 0
@@ -91,12 +245,14 @@ def fail_if_improved(
     return 0
 
 
-def main() -> int:
+def main(argv: list[str] | None = None) -> int:
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--check-and-annotate",
-        help="Comma-separated list of files to check, "
-        "and annotate those with warnings on GitHub Actions",
+        "--annotate-diff",
+        nargs="*",
+        metavar=("BASE_REF", "HEAD_REF"),
+        help="Add GitHub Actions annotations on the diff for warnings on "
+        "lines changed between the given refs (main and HEAD, by default)",
     )
     parser.add_argument(
         "--fail-if-regression",
@@ -108,13 +264,19 @@ def main() -> int:
         action="store_true",
         help="Fail if new files with no nits are found",
     )
-    args = parser.parse_args()
+
+    args = parser.parse_args(argv)
+    if args.annotate_diff is not None and len(args.annotate_diff) > 2:
+        parser.error(
+            "--annotate-diff takes between 0 and 2 ref args, not "
+            f"{len(args.annotate_diff)} {tuple(args.annotate_diff)}"
+        )
     exit_code = 0
 
     wrong_directory_msg = "Must run this script from the repo root"
     assert Path("Doc").exists() and Path("Doc").is_dir(), wrong_directory_msg
 
-    with Path("Doc/sphinx-warnings.txt").open() as f:
+    with Path("Doc/sphinx-warnings.txt").open(encoding="UTF-8") as f:
         warnings = f.read().splitlines()
 
     cwd = str(Path.cwd()) + os.path.sep
@@ -124,15 +286,15 @@ def main() -> int:
         if "Doc/" in warning
     }
 
-    with Path("Doc/tools/.nitignore").open() as clean_files:
+    with Path("Doc/tools/.nitignore").open(encoding="UTF-8") as clean_files:
         files_with_expected_nits = {
             filename.strip()
             for filename in clean_files
             if filename.strip() and not filename.startswith("#")
         }
 
-    if args.check_and_annotate:
-        check_and_annotate(warnings, args.check_and_annotate)
+    if args.annotate_diff is not None:
+        annotate_diff(warnings, *args.annotate_diff)
 
     if args.fail_if_regression:
         exit_code += fail_if_regression(



More information about the Python-checkins mailing list