Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions src/dayamlchecker/check_questions_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,13 +287,28 @@ def is_absolute_http_url(url: str) -> bool:


def is_reserved_example_domain(url: str) -> bool:
    """Check if URL is in a non-registrable example/test domain.

    RFC 2606 and RFC 6761 reserve the special-use names below. IANA also
    operates example.edu as an example domain, even though it is not listed
    in the RFC 6761 Special-Use Domain Names registry.

    Args:
        url: URL whose host should be classified.

    Returns:
        True when the host equals one of the reserved names or is a
        subdomain of one; False otherwise, including when the URL has no
        parseable host.
    """
    rfc_special_use_domains: frozenset[str] = frozenset(
        {
            "example",
            "example.com",
            "example.net",
            "example.org",
            "invalid",
            "localhost",
            "test",
        }
    )
    iana_managed_example_domains: frozenset[str] = frozenset({"example.edu"})
    ignored_domains = rfc_special_use_domains | iana_managed_example_domains

    hostname = (urlparse(url).hostname or "").lower()
    # A fully-qualified name may carry a trailing root dot ("example.com.");
    # drop it so the host is classified like its dotless spelling.
    if hostname.endswith("."):
        hostname = hostname[:-1]

    if hostname in ignored_domains:
        return True
    # Require the separating dot so look-alikes such as "notexample.com"
    # are not mistaken for subdomains of a reserved name.
    return hostname.endswith(tuple(f".{domain}" for domain in ignored_domains))


Expand Down Expand Up @@ -348,8 +363,10 @@ def parse_url_token(raw_url: str) -> tuple[str | None, bool]:
if not url.startswith(("http://", "https://")):
return None, False

# Link extraction in YAML/JS text can include trailing punctuation.
url = url.rstrip(".,;:!?)>]}")
# Link extraction in YAML/JS text can include trailing punctuation or
# markup artifacts (like markdown bold asterisks or Docassemble variable
# prefix symbols).
url = url.rstrip(".,;:!?)>]}{*$")

# Query strings are valid. For concatenation checks, inspect only the
# URL part before '?' so embedded URLs in query parameters don't trigger
Expand Down
93 changes: 53 additions & 40 deletions tests/test_check_questions_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,30 @@
check_urls,
extract_text_from_pdf,
extract_urls_from_file,
parse_url_token,
)


def test_parse_url_token_normalizes_markdown_and_docassemble_artifacts() -> None:
    """Trailing markup artifacts must not survive in the normalized URL."""
    cases = (
        # Markdown bold
        ("https://example.com**", "https://example.com"),
        # Docassemble variable prefixes
        ("https://github.com/$", "https://github.com/"),
        ("https://github.com/${", "https://github.com/"),
        # Trailing braces
        ("https://example.com/}", "https://example.com/"),
        # Combined trailing artifacts
        ("https://example.com/}$", "https://example.com/"),
    )
    for raw_token, expected_url in cases:
        # Only the first element of the result (the cleaned URL) is checked.
        assert parse_url_token(raw_token)[0] == expected_url


def test_extract_urls_skips_python_comment_urls(tmp_path: Path) -> None:
file_path = tmp_path / "example.py"
file_path = tmp_path / "suffolklitlab.org.py"
file_path.write_text(
"\n".join(
[
"# https://commented.example/full-line",
'live = "https://live.example/value"',
"value = 1 # https://commented.example/trailing",
"# https://commented.suffolklitlab.org/full-line",
'live = "https://live.suffolklitlab.org/value"',
"value = 1 # https://commented.suffolklitlab.org/trailing",
"",
]
),
Expand All @@ -30,19 +43,19 @@ def test_extract_urls_skips_python_comment_urls(tmp_path: Path) -> None:
file_path, LinkifyIt(options={"fuzzy_link": False})
)

assert urls == ["https://live.example/value"]
assert urls == ["https://live.suffolklitlab.org/value"]
assert concatenated == []


def test_extract_urls_skips_yaml_comment_urls(tmp_path: Path) -> None:
file_path = tmp_path / "example.yml"
file_path = tmp_path / "suffolklitlab.org.yml"
file_path.write_text(
"\n".join(
[
"# https://commented.example/full-line",
'live: "https://live.example/value"',
"# https://commented.suffolklitlab.org/full-line",
'live: "https://live.suffolklitlab.org/value"',
"note: keep # not-a-comment-inside-value",
"value: yes # https://commented.example/trailing",
"value: yes # https://commented.suffolklitlab.org/trailing",
"",
]
),
Expand All @@ -53,21 +66,21 @@ def test_extract_urls_skips_yaml_comment_urls(tmp_path: Path) -> None:
file_path, LinkifyIt(options={"fuzzy_link": False})
)

assert urls == ["https://live.example/value"]
assert urls == ["https://live.suffolklitlab.org/value"]
assert concatenated == []


def test_extract_urls_keeps_urls_in_multiline_double_quoted_yaml_scalar(
tmp_path: Path,
) -> None:
file_path = tmp_path / "example.yml"
file_path = tmp_path / "suffolklitlab.org.yml"
file_path.write_text(
"\n".join(
[
'note: "first line',
" https://live.example/double-quoted",
" https://live.suffolklitlab.org/double-quoted",
' # still content"',
"field: value # https://commented.example/trailing",
"field: value # https://commented.suffolklitlab.org/trailing",
"",
]
),
Expand All @@ -78,21 +91,21 @@ def test_extract_urls_keeps_urls_in_multiline_double_quoted_yaml_scalar(
file_path, LinkifyIt(options={"fuzzy_link": False})
)

assert urls == ["https://live.example/double-quoted"]
assert urls == ["https://live.suffolklitlab.org/double-quoted"]
assert concatenated == []


def test_extract_urls_keeps_urls_in_multiline_single_quoted_yaml_scalar(
tmp_path: Path,
) -> None:
file_path = tmp_path / "example.yml"
file_path = tmp_path / "suffolklitlab.org.yml"
file_path.write_text(
"\n".join(
[
"note: 'first line",
" https://live.example/single-quoted",
" https://live.suffolklitlab.org/single-quoted",
" it''s still content # not a comment'",
"field: value # https://commented.example/trailing",
"field: value # https://commented.suffolklitlab.org/trailing",
"",
]
),
Expand All @@ -103,25 +116,25 @@ def test_extract_urls_keeps_urls_in_multiline_single_quoted_yaml_scalar(
file_path, LinkifyIt(options={"fuzzy_link": False})
)

assert urls == ["https://live.example/single-quoted"]
assert urls == ["https://live.suffolklitlab.org/single-quoted"]
assert concatenated == []


def test_extract_urls_keeps_markdown_heading_urls_in_yaml_block_scalar(
tmp_path: Path,
) -> None:
file_path = tmp_path / "example.yml"
file_path = tmp_path / "suffolklitlab.org.yml"
file_path.write_text(
"\n".join(
[
"question: |",
" # Heading",
" Visit https://live.example/question",
" Visit https://live.suffolklitlab.org/question",
"subquestion: |",
" ## Subheading https://live.example/subquestion",
" ## Subheading https://live.suffolklitlab.org/subquestion",
"note: |",
" ### Note https://live.example/note",
"field: value # https://commented.example/trailing",
" ### Note https://live.suffolklitlab.org/note",
"field: value # https://commented.suffolklitlab.org/trailing",
"",
]
),
Expand All @@ -133,32 +146,32 @@ def test_extract_urls_keeps_markdown_heading_urls_in_yaml_block_scalar(
)

assert urls == [
"https://live.example/question",
"https://live.example/subquestion",
"https://live.example/note",
"https://live.suffolklitlab.org/question",
"https://live.suffolklitlab.org/subquestion",
"https://live.suffolklitlab.org/note",
]
assert concatenated == []


def test_extract_urls_keeps_markdown_heading_urls_in_template_and_attachment_blocks(
tmp_path: Path,
) -> None:
file_path = tmp_path / "example.yml"
file_path = tmp_path / "suffolklitlab.org.yml"
file_path.write_text(
"\n".join(
[
"template: review_email",
"subject: |",
" # Subject https://live.example/template-subject",
" # Subject https://live.suffolklitlab.org/template-subject",
"content: |",
" ## Content https://live.example/template-content",
" ## Content https://live.suffolklitlab.org/template-content",
"---",
"attachment:",
" name: review_letter",
" content: |",
" ### Attachment https://live.example/attachment-content",
" ### Attachment https://live.suffolklitlab.org/attachment-content",
" filename: review.pdf",
"metadata: yes # https://commented.example/trailing",
"metadata: yes # https://commented.suffolklitlab.org/trailing",
"",
]
),
Expand All @@ -170,24 +183,24 @@ def test_extract_urls_keeps_markdown_heading_urls_in_template_and_attachment_blo
)

assert urls == [
"https://live.example/template-subject",
"https://live.example/template-content",
"https://live.example/attachment-content",
"https://live.suffolklitlab.org/template-subject",
"https://live.suffolklitlab.org/template-content",
"https://live.suffolklitlab.org/attachment-content",
]
assert concatenated == []


def test_extract_urls_skips_python_comment_urls_in_code_block_scalar(
tmp_path: Path,
) -> None:
file_path = tmp_path / "example.yml"
file_path = tmp_path / "suffolklitlab.org.yml"
file_path.write_text(
"\n".join(
[
"code: |",
" # https://commented.example/full-line",
' live = "https://live.example/value"',
" value = 1 # https://commented.example/trailing",
" # https://commented.suffolklitlab.org/full-line",
' live = "https://live.suffolklitlab.org/value"',
" value = 1 # https://commented.suffolklitlab.org/trailing",
"",
]
),
Expand All @@ -198,14 +211,14 @@ def test_extract_urls_skips_python_comment_urls_in_code_block_scalar(
file_path, LinkifyIt(options={"fuzzy_link": False})
)

assert urls == ["https://live.example/value"]
assert urls == ["https://live.suffolklitlab.org/value"]
assert concatenated == []


def test_extract_text_from_pdf_keeps_wrapped_urls_in_raw_text(
tmp_path: Path, monkeypatch
) -> None:
file_path = tmp_path / "example.pdf"
file_path = tmp_path / "suffolklitlab.org.pdf"
file_path.write_bytes(b"%PDF-1.4\n")

class FakePage:
Expand Down
37 changes: 37 additions & 0 deletions tests/test_is_reserved_example_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pytest
from dayamlchecker.check_questions_urls import is_reserved_example_domain


@pytest.mark.parametrize(
    ("url", "expected"),
    [
        *(
            (reserved_url, True)
            for reserved_url in (
                # RFC 2606 / RFC 6761 special-use example domains
                "http://example.com",
                "https://example.org",
                "https://example.net",
                "http://my.example.com",
                "https://sub.sub.example.org",
                "http://another.example.net",
                "https://example.com/path",
                # RFC 2606 / RFC 6761 special-use TLDs
                "http://example",
                "https://test",
                "http://invalid",
                "https://localhost",
                "http://my.example",
                "https://sub.test",
                "http://something.invalid",
                "https://dev.localhost",
                # IANA-managed example domain, but not in the RFC 6761 registry
                "http://example.edu",
                "http://my.example.edu",
            )
        ),
        *(
            (real_url, False)
            for real_url in (
                # Similar-looking real domains should still be checked
                "https://notexample.com",
                "https://example.com.untrusted.com",
                "https://example.edu.untrusted.edu",
                "https://google.com",
                "https://suffolklitlab.org",
            )
        ),
    ],
)
def test_is_reserved_example_domain(url: str, expected: bool) -> None:
    """Each URL is classified as reserved (True) or real (False) as listed."""
    actual = is_reserved_example_domain(url)
    assert actual == expected
Loading