"""
Bandcamp Catalog Import Script

Scrapes the Goose Bandcamp discography page to get all album URLs,
then matches them to shows by date and updates the database.

Run: python import_bandcamp_catalog.py [--save]
"""

import re
import sys
from datetime import datetime

import requests
from sqlmodel import Session, select

from database import engine
from models import Show

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}

# Discography page that is scraped for album links.
_MUSIC_URL = "https://goosetheband.bandcamp.com/music"

# Album URLs look like https://goosetheband.bandcamp.com/album/YYYY-MM-DD-...
# The slug stops at quotes, angle brackets, whitespace and at '?', '#', '&'
# so that a query string, fragment or HTML-entity tail (e.g. "&quot;") that
# follows the URL in the page markup is not stored as part of the link.
_ALBUM_URL_RE = re.compile(
    r'https://goosetheband\.bandcamp\.com/album/'
    r'(\d{4})-(\d{2})-(\d{2})-[^"\'<>\s?#&]+'
)


def _extract_album_urls(html: str) -> dict:
    """
    Collect dated album URLs from a chunk of HTML.

    Returns dict mapping date (YYYY-MM-DD) -> URL. When several URLs share
    the same date, the first one encountered in the HTML is kept.
    """
    albums = {}
    for match in _ALBUM_URL_RE.finditer(html):
        year, month, day = match.groups()
        # setdefault keeps the first URL for each date (in case of duplicates)
        albums.setdefault(f"{year}-{month}-{day}", match.group(0))
    return albums


def scrape_bandcamp_catalog() -> dict:
    """
    Scrape the Goose Bandcamp music page for all album URLs.

    Returns dict mapping date (YYYY-MM-DD) -> URL. An empty dict is returned
    (after printing a message) when the request fails or the server answers
    with a status other than 200.
    """
    # Only the network call is guarded: request-level failures (connection
    # errors, timeouts, ...) are reported and treated as "no albums", while
    # unexpected programming errors are allowed to surface.
    try:
        resp = requests.get(_MUSIC_URL, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"Error scraping Bandcamp: {e}")
        return {}

    if resp.status_code != 200:
        print(f"Failed to fetch Bandcamp catalog: {resp.status_code}")
        return {}

    albums = _extract_album_urls(resp.text)
    print(f"Scraped {len(albums)} Bandcamp album URLs")
    return albums


def main(dry_run: bool = True) -> None:
    """
    Match scraped Bandcamp albums to shows by date and report the result.

    Shows whose ``bandcamp_link`` already contains "/album/" are left alone.
    For every other show, the album scraped for the show's date (if any) is
    printed and counted. When ``dry_run`` is False the link is also written
    to the show and the session is committed; when True nothing is written.
    """
    print("=" * 60)
    print("Bandcamp Catalog Import")
    print("=" * 60)

    # Scrape catalog
    bandcamp_albums = scrape_bandcamp_catalog()
    if not bandcamp_albums:
        print("No albums found, exiting.")
        return

    updated = 0
    already_had = 0
    not_found = 0

    with Session(engine) as session:
        # Get all shows
        shows = session.exec(select(Show)).all()
        print(f"Found {len(shows)} shows in database")

        for show in shows:
            # Skip shows that already have a proper album link
            existing_link = show.bandcamp_link
            if existing_link and "/album/" in existing_link:
                already_had += 1
                continue

            # Look up in catalog by the show's date
            date_str = show.date.strftime("%Y-%m-%d")
            new_url = bandcamp_albums.get(date_str)
            if new_url is None:
                not_found += 1
                continue

            print(f"✓ {date_str} -> {new_url}")
            updated += 1

            if not dry_run:
                show.bandcamp_link = new_url
                session.add(show)

        # Single commit after all shows have been processed
        if not dry_run:
            session.commit()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Shows with new Bandcamp link: {updated}")
    print(f"Shows already had album link: {already_had}")
    print(f"Shows not found in Bandcamp: {not_found}")

    if dry_run:
        print("\n[DRY RUN - Run with --save to commit changes]")


if __name__ == "__main__":
    # Changes are only written when --save is passed explicitly.
    dry_run = "--save" not in sys.argv
    main(dry_run=dry_run)