From 26171e1937a43a91f4257529217d79af3e2a8746 Mon Sep 17 00:00:00 2001
From: fullsizemalt <106900403+fullsizemalt@users.noreply.github.com>
Date: Wed, 24 Dec 2025 14:23:07 -0800
Subject: [PATCH] fix: correct Bandcamp regex pattern

---
 backend/import_bandcamp_catalog.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/backend/import_bandcamp_catalog.py b/backend/import_bandcamp_catalog.py
index 2235abc..f284269 100644
--- a/backend/import_bandcamp_catalog.py
+++ b/backend/import_bandcamp_catalog.py
@@ -33,18 +33,19 @@ def scrape_bandcamp_catalog() -> dict:
         
         html = resp.text
         
-        # Extract album URLs and parse dates from them
-        # Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-...
-        url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'
+        # Extract album URLs. Pattern: album/YYYY-MM-DD-<slug>, where <slug> is lowercase letters, digits and hyphens.
+        # In the page HTML the URLs appear as relative paths, e.g. album/2025-12-13-goosemas-show-upon-time...
+        url_pattern = r'album/(\d{4})-(\d{2})-(\d{2})-[a-z0-9-]+'
         
         for match in re.finditer(url_pattern, html):
-            url = match.group(0)
+            album_path = match.group(0)
             year, month, day = match.groups()
             date_str = f"{year}-{month}-{day}"
+            full_url = f"https://goosetheband.bandcamp.com/{album_path}"
             
             # Keep the first URL for each date (in case of duplicates)
             if date_str not in albums:
-                albums[date_str] = url
+                albums[date_str] = full_url
         
         print(f"Scraped {len(albums)} Bandcamp album URLs")
         return albums