feat: add Bandcamp catalog import script
parent d54b217264
commit ca28293cea
1 changed file with 113 additions and 0 deletions
backend/import_bandcamp_catalog.py (new file, +113)
@@ -0,0 +1,113 @@
"""
Bandcamp Catalog Import Script

Scrapes the Goose Bandcamp discography page to get all album URLs,
then matches them to shows by date and updates the database.

Run: python import_bandcamp_catalog.py [--save]
"""
import re

import requests
from sqlmodel import Session, select

from database import engine
from models import Show

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}


def scrape_bandcamp_catalog() -> dict:
    """
    Scrape the Goose Bandcamp music page for all album URLs.
    Returns a dict mapping date (YYYY-MM-DD) -> album URL.
    """
    music_url = "https://goosetheband.bandcamp.com/music"
    albums = {}

    try:
        resp = requests.get(music_url, headers=HEADERS, timeout=30)
        if resp.status_code != 200:
            print(f"Failed to fetch Bandcamp catalog: {resp.status_code}")
            return {}

        html = resp.text

        # Extract album URLs and parse dates from them.
        # Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-...
        url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'

        for match in re.finditer(url_pattern, html):
            url = match.group(0)
            year, month, day = match.groups()
            date_str = f"{year}-{month}-{day}"

            # Keep the first URL for each date (in case of duplicates).
            if date_str not in albums:
                albums[date_str] = url

        print(f"Scraped {len(albums)} Bandcamp album URLs")
        return albums

    except Exception as e:
        print(f"Error scraping Bandcamp: {e}")
        return {}


def main(dry_run: bool = True):
    """Match scraped album URLs to shows by date; write changes only with --save."""
    print("=" * 60)
    print("Bandcamp Catalog Import")
    print("=" * 60)

    # Scrape the full catalog once up front.
    bandcamp_albums = scrape_bandcamp_catalog()

    if not bandcamp_albums:
        print("No albums found, exiting.")
        return

    with Session(engine) as session:
        # Get all shows.
        shows = session.exec(select(Show)).all()
        print(f"Found {len(shows)} shows in database")

        updated = 0
        already_had = 0
        not_found = 0

        for show in shows:
            date_str = show.date.strftime("%Y-%m-%d")

            # Skip shows that already have a proper album link.
            if show.bandcamp_link and "/album/" in show.bandcamp_link:
                already_had += 1
                continue

            # Look up the show's date in the scraped catalog.
            if date_str in bandcamp_albums:
                new_url = bandcamp_albums[date_str]
                print(f"✓ {date_str} -> {new_url}")
                updated += 1

                if not dry_run:
                    show.bandcamp_link = new_url
                    session.add(show)
            else:
                not_found += 1

        if not dry_run:
            session.commit()

        print("\n" + "=" * 60)
        print("Summary")
        print("=" * 60)
        print(f"Shows with new Bandcamp link: {updated}")
        print(f"Shows already had album link: {already_had}")
        print(f"Shows not found in Bandcamp: {not_found}")

        if dry_run:
            print("\n[DRY RUN - Run with --save to commit changes]")


if __name__ == "__main__":
    import sys

    dry_run = "--save" not in sys.argv
    main(dry_run=dry_run)
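
For reference, a quick sanity check of the URL pattern above; the sample href is invented purely to illustrate the expected shape of a Bandcamp album URL:

import re

url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'

# Hypothetical href, not a real album URL; shows the YYYY-MM-DD slug format.
sample = 'href="https://goosetheband.bandcamp.com/album/2023-06-10-some-venue-some-city"'
m = re.search(url_pattern, sample)
assert m is not None
print(m.group(0))  # https://goosetheband.bandcamp.com/album/2023-06-10-some-venue-some-city
print(m.groups())  # ('2023', '06', '10')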
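
For context, the script assumes the Show model exposes at least a date and a nullable bandcamp_link column. A minimal sketch of what models.py might define (hypothetical; the actual model is not part of this commit):

# Hypothetical sketch of the Show model this script depends on;
# the real models.py is not included in this commit.
from datetime import date as date_type
from typing import Optional

from sqlmodel import Field, SQLModel


class Show(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    date: date_type                       # read via show.date.strftime("%Y-%m-%d")
    bandcamp_link: Optional[str] = None   # set by this import script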