"""
Show Audio Link Discovery Script

This script discovers show-specific Bandcamp and Nugs.net URLs by:
1. For Bandcamp: Testing URL patterns based on date and venue
2. For Nugs: Scraping the Nugs catalog (see ``scrape_nugs_catalog``) or, as
   ``main`` currently does, testing constructed URL patterns

Run from backend container:
    python discover_audio_links.py            # dry run, first 20 shows
    python discover_audio_links.py --all      # dry run, every show
    python discover_audio_links.py --save     # write discovered links to the DB
"""
import re
import sys
from datetime import datetime

import requests
from sqlmodel import Session, select

from database import engine
from models import Show, Venue
from slugify import generate_slug

# Session headers to mimic browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}


def slugify_for_bandcamp(text: str) -> str:
    """Convert text to Bandcamp-friendly slug (lowercase, hyphens, no special chars)."""
    text = text.lower()
    # Drop everything except lowercase letters, digits, whitespace and hyphens.
    text = re.sub(r'[^a-z0-9\s-]', '', text)
    # Collapse whitespace runs into single hyphens, then squeeze repeated hyphens.
    text = re.sub(r'[\s_]+', '-', text)
    text = re.sub(r'-+', '-', text)
    return text.strip('-')


def check_url_exists(url: str) -> bool:
    """Return True if a HEAD request to ``url`` (following redirects) ends in 200 OK.

    Any network-level failure (timeout, DNS error, connection reset, ...) is
    treated as "does not exist" and yields False.
    """
    try:
        resp = requests.head(url, headers=HEADERS, timeout=5, allow_redirects=True)
    except requests.RequestException:
        # Only swallow request failures; KeyboardInterrupt etc. must propagate.
        return False
    return resp.status_code == 200


def discover_bandcamp_url(show: Show, venue: Venue) -> str | None:
    """
    Try to find Bandcamp album URL for a show.

    Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-venue-city-state

    Candidate slugs are probed in order (venue-city-state, city-state,
    venue-city) and the first URL that responds with 200 is returned.

    Returns:
        The matching album URL, or None if there is no venue or no candidate
        URL exists.
    """
    if not venue:
        return None

    date_str = show.date.strftime("%Y-%m-%d")

    # Missing venue fields become empty strings so they can be skipped below
    # instead of crashing inside slugify_for_bandcamp.
    name_slug = slugify_for_bandcamp(venue.name) if venue.name else ""
    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
    state_slug = venue.state.lower() if venue.state else ""

    # Build venue slug variations, most specific first.
    venue_slugs = []
    if city_slug and state_slug:
        if name_slug:
            venue_slugs.append(f"{name_slug}-{city_slug}-{state_slug}")
        # Some albums just use the location.
        venue_slugs.append(f"{city_slug}-{state_slug}")
    if name_slug and city_slug:
        venue_slugs.append(f"{name_slug}-{city_slug}")

    # dict.fromkeys de-duplicates while preserving order, so no URL is probed twice.
    for venue_slug in dict.fromkeys(venue_slugs):
        url = f"https://goosetheband.bandcamp.com/album/{date_str}-{venue_slug}"
        if check_url_exists(url):
            return url

    return None


def scrape_nugs_catalog() -> dict[str, str]:
    """
    Query the Nugs catalog API to build a mapping of date -> URL.

    Returns dict like {"2024-12-31": "https://www.nugs.net/..."}. An empty dict
    is returned when the request fails, the response is not valid JSON, or the
    payload does not have the expected ``Response -> Shows`` structure.
    """
    # The Nugs catalog is a SPA, so we need to use their API
    api_url = "https://streamapi.nugs.net/api.aspx"
    params = {
        "method": "catalog.container",
        "containerType": "artist",
        "containerId": "461",  # Goose's artist ID
    }

    try:
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
    except requests.RequestException as e:
        print(f"Error scraping Nugs: {e}")
        return {}

    if resp.status_code != 200:
        print(f"Failed to fetch Nugs catalog: {resp.status_code}")
        return {}

    try:
        data = resp.json()
    except ValueError as e:
        # requests' JSON decode errors derive from ValueError.
        print(f"Error scraping Nugs: {e}")
        return {}

    # Validate the payload shape instead of relying on a catch-all except.
    response = data.get("Response") if isinstance(data, dict) else None
    entries = response.get("Shows") if isinstance(response, dict) else None
    if not isinstance(entries, list):
        return {}

    nugs_shows = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        # `or ""` also covers an explicit null perfDate in the payload.
        perf_date = str(entry.get("perfDate") or "")[:10]  # YYYY-MM-DD
        show_id = entry.get("showID")
        if perf_date and show_id:
            nugs_shows[perf_date] = (
                f"https://www.nugs.net/live-download/goose-{perf_date.replace('-', '')}-{show_id}.html"
            )

    return nugs_shows


def discover_nugs_url_fallback(show: Show, venue: Venue) -> str | None:
    """
    Fallback: Try constructing Nugs URL patterns if API doesn't work.

    Returns:
        The first constructed URL that responds with 200, or None if there is
        no venue or no pattern matches.
    """
    if not venue:
        return None

    date_compact = show.date.strftime("%Y%m%d")

    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
    state = venue.state.lower() if venue.state else ""

    # Try common Nugs URL patterns
    patterns = []
    if city_slug and state:
        # Only probe the location-qualified URL when both parts exist; otherwise
        # the URL would contain dangling hyphens and can never match.
        patterns.append(
            f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html"
        )
    patterns.append(f"https://www.nugs.net/live-download/goose-{date_compact}.html")

    for url in patterns:
        if check_url_exists(url):
            return url

    return None


def main(dry_run: bool = True, limit: int | None = None) -> None:
    """
    Main discovery function.

    Args:
        dry_run: If True, print changes but don't write to DB
        limit: Max number of shows to process (for testing). A falsy value
            (None or 0) means no limit.
    """
    print("=" * 60)
    print("Show Audio Link Discovery")
    print("=" * 60)

    with Session(engine) as session:
        # Newest shows first
        query = select(Show).order_by(Show.date.desc())
        if limit:
            query = query.limit(limit)

        shows = session.exec(query).all()
        print(f"Found {len(shows)} shows to process")

        # Stats
        bandcamp_found = 0
        nugs_found = 0
        already_have_bandcamp = 0
        already_have_nugs = 0

        for show in shows:
            venue = session.get(Venue, show.venue_id) if show.venue_id else None
            date_str = show.date.strftime("%Y-%m-%d")

            # Skip if already has specific links (not the generic band-level ones)
            bandcamp_link = show.bandcamp_link or ""
            generic_bandcamp = (
                "goosetheband.bandcamp.com" in bandcamp_link
                and "/album/" not in bandcamp_link
            )
            generic_nugs = "utm_source=goose" in (show.nugs_link or "")

            # Bandcamp discovery
            if not show.bandcamp_link or generic_bandcamp:
                bc_url = discover_bandcamp_url(show, venue)
                if bc_url:
                    print(f"✓ Bandcamp: {date_str} -> {bc_url}")
                    bandcamp_found += 1
                    if not dry_run:
                        show.bandcamp_link = bc_url
                        session.add(show)
            else:
                already_have_bandcamp += 1

            # Nugs discovery (using fallback for now)
            if not show.nugs_link or generic_nugs:
                nugs_url = discover_nugs_url_fallback(show, venue)
                if nugs_url:
                    print(f"✓ Nugs: {date_str} -> {nugs_url}")
                    nugs_found += 1
                    if not dry_run:
                        show.nugs_link = nugs_url
                        session.add(show)
            else:
                already_have_nugs += 1

        # One commit for the whole run so all discovered links are saved together.
        if not dry_run:
            session.commit()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Shows processed: {len(shows)}")
    print(f"Bandcamp links found: {bandcamp_found}")
    print(f"Nugs links found: {nugs_found}")
    print(f"Already had Bandcamp: {already_have_bandcamp}")
    print(f"Already had Nugs: {already_have_nugs}")

    if dry_run:
        print("\n[DRY RUN - No changes saved. Run with dry_run=False to save]")


if __name__ == "__main__":
    # Defaults are deliberately safe: dry run over the 20 most recent shows.
    dry_run = "--save" not in sys.argv
    limit = None if "--all" in sys.argv else 20  # Test with 20 shows first

    main(dry_run=dry_run, limit=limit)