fix: correct Bandcamp regex pattern
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run

This commit is contained in:
fullsizemalt 2025-12-24 14:23:07 -08:00
parent ca28293cea
commit 26171e1937

View file

@@ -33,18 +33,19 @@ def scrape_bandcamp_catalog() -> dict:
html = resp.text html = resp.text
# Extract album URLs and parse dates from them # Extract album URLs - pattern: album/YYYY-MM-DD-rest-of-slug
# Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-... # The URLs appear as relative paths like: album/2025-12-13-goosemas-show-upon-time...
url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+' url_pattern = r'album/(\d{4})-(\d{2})-(\d{2})-[a-z0-9-]+'
for match in re.finditer(url_pattern, html): for match in re.finditer(url_pattern, html):
url = match.group(0) album_path = match.group(0)
year, month, day = match.groups() year, month, day = match.groups()
date_str = f"{year}-{month}-{day}" date_str = f"{year}-{month}-{day}"
full_url = f"https://goosetheband.bandcamp.com/{album_path}"
# Keep the first URL for each date (in case of duplicates) # Keep the first URL for each date (in case of duplicates)
if date_str not in albums: if date_str not in albums:
albums[date_str] = url albums[date_str] = full_url
print(f"Scraped {len(albums)} Bandcamp album URLs") print(f"Scraped {len(albums)} Bandcamp album URLs")
return albums return albums