# Recovered from git patch ca28293c ("feat: add Bandcamp catalog import script",
# 2025-12-24); target path: backend/import_bandcamp_catalog.py
"""
Bandcamp Catalog Import Script

Scrapes the Goose Bandcamp discography page to get all album URLs,
then matches them to shows by date and updates the database.

Run: python import_bandcamp_catalog.py [--save]

Without --save the script only reports what it would change (dry run).
"""
import re
import sys
from datetime import datetime
from typing import Dict

import requests
from sqlmodel import Session, select

from database import engine
from models import Show

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}

BANDCAMP_BASE_URL = "https://goosetheband.bandcamp.com"
BANDCAMP_MUSIC_URL = f"{BANDCAMP_BASE_URL}/music"

# Matches album links whose slug starts with a date: /album/YYYY-MM-DD-...
# The link may be absolute (prefixed with the band's Bandcamp host) or
# site-relative. A relative link must directly follow a quote character so
# that the "/album/..." tail of some *other* host's URL is never picked up.
# Group 1 is the path, groups 2-4 are year, month and day.
_ALBUM_LINK_RE = re.compile(
    r"(?:" + re.escape(BANDCAMP_BASE_URL) + r"""|(?<=["']))"""
    r"""(/album/(\d{4})-(\d{2})-(\d{2})-[^"'>\s]+)"""
)


def _extract_album_urls(html: str) -> Dict[str, str]:
    """
    Pull dated album links out of a Bandcamp page.

    Returns dict mapping date (YYYY-MM-DD) -> absolute album URL.
    When several links share a date, the first one in the page wins.
    """
    albums: Dict[str, str] = {}
    for match in _ALBUM_LINK_RE.finditer(html):
        path, year, month, day = match.groups()
        # setdefault keeps the first URL seen for each date.
        albums.setdefault(f"{year}-{month}-{day}", BANDCAMP_BASE_URL + path)
    return albums


def scrape_bandcamp_catalog() -> Dict[str, str]:
    """
    Scrape the Goose Bandcamp music page for all album URLs.

    Returns dict mapping date (YYYY-MM-DD) -> URL. Returns an empty dict
    (after printing the reason) when the request fails or the server does
    not answer with HTTP 200.
    """
    # Only the network call is guarded; parsing errors would be programming
    # bugs and should surface instead of being reported as a scrape failure.
    try:
        resp = requests.get(BANDCAMP_MUSIC_URL, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"Error scraping Bandcamp: {e}")
        return {}

    if resp.status_code != 200:
        print(f"Failed to fetch Bandcamp catalog: {resp.status_code}")
        return {}

    albums = _extract_album_urls(resp.text)
    print(f"Scraped {len(albums)} Bandcamp album URLs")
    return albums


def main(dry_run: bool = True):
    """
    Match scraped Bandcamp albums to shows by date and store the links.

    dry_run: when True (the default) nothing is written; the matches and
    the summary are only printed. When False, each matched show gets its
    bandcamp_link set and the session is committed once at the end.
    """
    print("=" * 60)
    print("Bandcamp Catalog Import")
    print("=" * 60)

    # Scrape catalog
    bandcamp_albums = scrape_bandcamp_catalog()

    if not bandcamp_albums:
        print("No albums found, exiting.")
        return

    with Session(engine) as session:
        # Get all shows
        shows = session.exec(select(Show)).all()
        print(f"Found {len(shows)} shows in database")

        updated = 0
        already_had = 0
        not_found = 0

        for show in shows:
            # Check if already has a proper album link
            if show.bandcamp_link and "/album/" in show.bandcamp_link:
                already_had += 1
                continue

            # Defensive: a show without a date cannot be matched to an album.
            # NOTE(review): assumes Show.date may be nullable - confirm
            # against the model.
            if show.date is None:
                not_found += 1
                continue

            date_str = show.date.strftime("%Y-%m-%d")

            # Look up in catalog
            new_url = bandcamp_albums.get(date_str)
            if new_url is None:
                not_found += 1
                continue

            print(f"✓ {date_str} -> {new_url}")
            updated += 1

            if not dry_run:
                show.bandcamp_link = new_url
                session.add(show)

        if not dry_run:
            session.commit()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Shows with new Bandcamp link: {updated}")
    print(f"Shows already had album link: {already_had}")
    print(f"Shows not found in Bandcamp: {not_found}")

    if dry_run:
        print("\n[DRY RUN - Run with --save to commit changes]")


if __name__ == "__main__":
    # Writes happen only when --save is passed explicitly.
    dry_run = "--save" not in sys.argv
    main(dry_run=dry_run)