mirror of
https://github.com/coleam00/Archon
synced 2026-05-02 10:57:18 +00:00
222 lines
8.6 KiB
Python
222 lines
8.6 KiB
Python
|
|
"""
|
||
|
|
Test URL canonicalization in source ID generation.
|
||
|
|
|
||
|
|
This test ensures that URLs are properly normalized before hashing
|
||
|
|
to prevent duplicate sources from URL variations.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
from src.server.services.crawling.helpers.url_handler import URLHandler
|
||
|
|
|
||
|
|
|
||
|
|
class TestURLCanonicalization:
|
||
|
|
"""Test that URL canonicalization works correctly for source ID generation."""
|
||
|
|
|
||
|
|
def test_trailing_slash_normalization(self):
|
||
|
|
"""Test that trailing slashes are handled consistently."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
# These should generate the same ID
|
||
|
|
url1 = "https://example.com/path"
|
||
|
|
url2 = "https://example.com/path/"
|
||
|
|
|
||
|
|
id1 = handler.generate_unique_source_id(url1)
|
||
|
|
id2 = handler.generate_unique_source_id(url2)
|
||
|
|
|
||
|
|
assert id1 == id2, "URLs with/without trailing slash should generate same ID"
|
||
|
|
|
||
|
|
# Root path should keep its slash
|
||
|
|
root1 = "https://example.com"
|
||
|
|
root2 = "https://example.com/"
|
||
|
|
|
||
|
|
root_id1 = handler.generate_unique_source_id(root1)
|
||
|
|
root_id2 = handler.generate_unique_source_id(root2)
|
||
|
|
|
||
|
|
# These should be the same (both normalize to https://example.com/)
|
||
|
|
assert root_id1 == root_id2, "Root URLs should normalize consistently"
|
||
|
|
|
||
|
|
def test_fragment_removal(self):
|
||
|
|
"""Test that URL fragments are removed."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
urls = [
|
||
|
|
"https://example.com/page",
|
||
|
|
"https://example.com/page#section1",
|
||
|
|
"https://example.com/page#section2",
|
||
|
|
"https://example.com/page#",
|
||
|
|
]
|
||
|
|
|
||
|
|
ids = [handler.generate_unique_source_id(url) for url in urls]
|
||
|
|
|
||
|
|
# All should generate the same ID
|
||
|
|
assert len(set(ids)) == 1, "URLs with different fragments should generate same ID"
|
||
|
|
|
||
|
|
def test_tracking_param_removal(self):
|
||
|
|
"""Test that tracking parameters are removed."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
# URL without tracking params
|
||
|
|
clean_url = "https://example.com/page?important=value"
|
||
|
|
|
||
|
|
# URLs with various tracking params
|
||
|
|
tracked_urls = [
|
||
|
|
"https://example.com/page?important=value&utm_source=google",
|
||
|
|
"https://example.com/page?utm_medium=email&important=value",
|
||
|
|
"https://example.com/page?important=value&fbclid=abc123",
|
||
|
|
"https://example.com/page?gclid=xyz&important=value&utm_campaign=test",
|
||
|
|
"https://example.com/page?important=value&ref=homepage",
|
||
|
|
"https://example.com/page?source=newsletter&important=value",
|
||
|
|
]
|
||
|
|
|
||
|
|
clean_id = handler.generate_unique_source_id(clean_url)
|
||
|
|
tracked_ids = [handler.generate_unique_source_id(url) for url in tracked_urls]
|
||
|
|
|
||
|
|
# All tracked URLs should generate the same ID as the clean URL
|
||
|
|
for tracked_id in tracked_ids:
|
||
|
|
assert tracked_id == clean_id, "URLs with tracking params should match clean URL"
|
||
|
|
|
||
|
|
def test_query_param_sorting(self):
|
||
|
|
"""Test that query parameters are sorted for consistency."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
urls = [
|
||
|
|
"https://example.com/page?a=1&b=2&c=3",
|
||
|
|
"https://example.com/page?c=3&a=1&b=2",
|
||
|
|
"https://example.com/page?b=2&c=3&a=1",
|
||
|
|
]
|
||
|
|
|
||
|
|
ids = [handler.generate_unique_source_id(url) for url in urls]
|
||
|
|
|
||
|
|
# All should generate the same ID
|
||
|
|
assert len(set(ids)) == 1, "URLs with reordered query params should generate same ID"
|
||
|
|
|
||
|
|
def test_default_port_removal(self):
|
||
|
|
"""Test that default ports are removed."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
# HTTP default port (80)
|
||
|
|
http_urls = [
|
||
|
|
"http://example.com/page",
|
||
|
|
"http://example.com:80/page",
|
||
|
|
]
|
||
|
|
|
||
|
|
http_ids = [handler.generate_unique_source_id(url) for url in http_urls]
|
||
|
|
assert len(set(http_ids)) == 1, "HTTP URLs with/without :80 should generate same ID"
|
||
|
|
|
||
|
|
# HTTPS default port (443)
|
||
|
|
https_urls = [
|
||
|
|
"https://example.com/page",
|
||
|
|
"https://example.com:443/page",
|
||
|
|
]
|
||
|
|
|
||
|
|
https_ids = [handler.generate_unique_source_id(url) for url in https_urls]
|
||
|
|
assert len(set(https_ids)) == 1, "HTTPS URLs with/without :443 should generate same ID"
|
||
|
|
|
||
|
|
# Non-default ports should be preserved
|
||
|
|
url1 = "https://example.com:8080/page"
|
||
|
|
url2 = "https://example.com:9090/page"
|
||
|
|
|
||
|
|
id1 = handler.generate_unique_source_id(url1)
|
||
|
|
id2 = handler.generate_unique_source_id(url2)
|
||
|
|
|
||
|
|
assert id1 != id2, "URLs with different non-default ports should generate different IDs"
|
||
|
|
|
||
|
|
def test_case_normalization(self):
|
||
|
|
"""Test that scheme and domain are lowercased."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
urls = [
|
||
|
|
"https://example.com/Path/To/Page",
|
||
|
|
"HTTPS://EXAMPLE.COM/Path/To/Page",
|
||
|
|
"https://Example.Com/Path/To/Page",
|
||
|
|
"HTTPs://example.COM/Path/To/Page",
|
||
|
|
]
|
||
|
|
|
||
|
|
ids = [handler.generate_unique_source_id(url) for url in urls]
|
||
|
|
|
||
|
|
# All should generate the same ID (path case is preserved)
|
||
|
|
assert len(set(ids)) == 1, "URLs with different case in scheme/domain should generate same ID"
|
||
|
|
|
||
|
|
# But different paths should generate different IDs
|
||
|
|
path_urls = [
|
||
|
|
"https://example.com/path",
|
||
|
|
"https://example.com/Path",
|
||
|
|
"https://example.com/PATH",
|
||
|
|
]
|
||
|
|
|
||
|
|
path_ids = [handler.generate_unique_source_id(url) for url in path_urls]
|
||
|
|
|
||
|
|
# These should be different (path case matters)
|
||
|
|
assert len(set(path_ids)) == 3, "URLs with different path case should generate different IDs"
|
||
|
|
|
||
|
|
def test_complex_canonicalization(self):
|
||
|
|
"""Test complex URL with multiple normalizations needed."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
urls = [
|
||
|
|
"https://example.com/page",
|
||
|
|
"HTTPS://EXAMPLE.COM:443/page/",
|
||
|
|
"https://Example.com/page#section",
|
||
|
|
"https://example.com/page/?utm_source=test",
|
||
|
|
"https://example.com:443/page?utm_campaign=abc#footer",
|
||
|
|
]
|
||
|
|
|
||
|
|
ids = [handler.generate_unique_source_id(url) for url in urls]
|
||
|
|
|
||
|
|
# All should generate the same ID
|
||
|
|
assert len(set(ids)) == 1, "Complex URLs should normalize to same ID"
|
||
|
|
|
||
|
|
def test_edge_cases(self):
|
||
|
|
"""Test edge cases and error handling."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
# Empty URL
|
||
|
|
empty_id = handler.generate_unique_source_id("")
|
||
|
|
assert len(empty_id) == 16, "Empty URL should still generate valid ID"
|
||
|
|
|
||
|
|
# Invalid URL
|
||
|
|
invalid_id = handler.generate_unique_source_id("not-a-url")
|
||
|
|
assert len(invalid_id) == 16, "Invalid URL should still generate valid ID"
|
||
|
|
|
||
|
|
# URL with special characters
|
||
|
|
special_url = "https://example.com/page?key=value%20with%20spaces"
|
||
|
|
special_id = handler.generate_unique_source_id(special_url)
|
||
|
|
assert len(special_id) == 16, "URL with encoded chars should generate valid ID"
|
||
|
|
|
||
|
|
# Very long URL
|
||
|
|
long_url = "https://example.com/" + "a" * 1000
|
||
|
|
long_id = handler.generate_unique_source_id(long_url)
|
||
|
|
assert len(long_id) == 16, "Long URL should generate valid ID"
|
||
|
|
|
||
|
|
def test_preserves_important_params(self):
|
||
|
|
"""Test that non-tracking params are preserved."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
# These have different important params, should be different
|
||
|
|
url1 = "https://api.example.com/v1/users?page=1"
|
||
|
|
url2 = "https://api.example.com/v1/users?page=2"
|
||
|
|
|
||
|
|
id1 = handler.generate_unique_source_id(url1)
|
||
|
|
id2 = handler.generate_unique_source_id(url2)
|
||
|
|
|
||
|
|
assert id1 != id2, "URLs with different important params should generate different IDs"
|
||
|
|
|
||
|
|
# But tracking params should still be removed
|
||
|
|
url3 = "https://api.example.com/v1/users?page=1&utm_source=docs"
|
||
|
|
id3 = handler.generate_unique_source_id(url3)
|
||
|
|
|
||
|
|
assert id3 == id1, "Adding tracking params shouldn't change ID"
|
||
|
|
|
||
|
|
def test_local_file_paths(self):
|
||
|
|
"""Test handling of local file paths."""
|
||
|
|
handler = URLHandler()
|
||
|
|
|
||
|
|
# File URLs
|
||
|
|
file_url = "file:///Users/test/document.pdf"
|
||
|
|
file_id = handler.generate_unique_source_id(file_url)
|
||
|
|
assert len(file_id) == 16, "File URL should generate valid ID"
|
||
|
|
|
||
|
|
# Relative paths
|
||
|
|
relative_path = "../documents/file.txt"
|
||
|
|
relative_id = handler.generate_unique_source_id(relative_path)
|
||
|
|
assert len(relative_id) == 16, "Relative path should generate valid ID"
|