From 26171e1937a43a91f4257529217d79af3e2a8746 Mon Sep 17 00:00:00 2001 From: fullsizemalt <106900403+fullsizemalt@users.noreply.github.com> Date: Wed, 24 Dec 2025 14:23:07 -0800 Subject: [PATCH] fix: correct Bandcamp regex pattern --- backend/import_bandcamp_catalog.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/import_bandcamp_catalog.py b/backend/import_bandcamp_catalog.py index 2235abc..f284269 100644 --- a/backend/import_bandcamp_catalog.py +++ b/backend/import_bandcamp_catalog.py @@ -33,18 +33,19 @@ def scrape_bandcamp_catalog() -> dict: html = resp.text - # Extract album URLs and parse dates from them - # Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-... - url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+' + # Extract album URLs - pattern: album/YYYY-MM-DD-rest-of-slug + # The URLs appear as relative paths like: album/2025-12-13-goosemas-show-upon-time... + url_pattern = r'album/(\d{4})-(\d{2})-(\d{2})-[a-z0-9-]+' for match in re.finditer(url_pattern, html): - url = match.group(0) + album_path = match.group(0) year, month, day = match.groups() date_str = f"{year}-{month}-{day}" + full_url = f"https://goosetheband.bandcamp.com/{album_path}" # Keep the first URL for each date (in case of duplicates) if date_str not in albums: - albums[date_str] = url + albums[date_str] = full_url print(f"Scraped {len(albums)} Bandcamp album URLs") return albums