# Provenance: reconstructed from patch 58edc0e07068a7f450ede679dd275434832c9429
# ("feat: add audio link discovery script"), file backend/discover_audio_links.py.
"""
Show Audio Link Discovery Script

This script discovers show-specific Bandcamp and Nugs.net URLs by:
1. For Bandcamp: Testing URL patterns based on date and venue
2. For Nugs: Testing URL patterns based on date and location
   (a catalog-API lookup exists in ``scrape_nugs_catalog`` but ``main``
   does not call it yet)

Run from backend container: python discover_audio_links.py
    (no flags)  dry run over the 20 most recent shows
    --save      write discovered links to the database
    --all       process every show instead of the first 20
"""
import re
from datetime import datetime

import requests
from sqlmodel import Session, select

from database import engine
from models import Show, Venue
from slugify import generate_slug

# Session headers to mimic browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}


def slugify_for_bandcamp(text: str) -> str:
    """Convert text to a Bandcamp-style slug.

    Lowercases the input, drops every character that is not ``a-z``, ``0-9``,
    whitespace, underscore or hyphen, turns runs of whitespace/underscores
    into a single hyphen, collapses repeated hyphens and trims hyphens from
    both ends.

    Args:
        text: Free-form text such as a venue or city name.

    Returns:
        The slug; may be empty if nothing survives the filtering.
    """
    text = text.lower()
    # Underscores must survive this pass so the next one can turn them into
    # hyphens (previously they were stripped here and never converted).
    text = re.sub(r'[^a-z0-9\s_-]', '', text)
    text = re.sub(r'[\s_]+', '-', text)
    text = re.sub(r'-+', '-', text)
    return text.strip('-')


def check_url_exists(url: str) -> bool:
    """Return True when a HEAD request to *url* ends in HTTP 200.

    Redirects are followed. Any network-level failure (timeout, DNS error,
    connection reset, ...) is treated as "does not exist" rather than raised,
    because callers probe many speculative URLs.

    Args:
        url: Absolute URL to probe.

    Returns:
        True on a final 200 response, False otherwise.
    """
    try:
        resp = requests.head(url, headers=HEADERS, timeout=5, allow_redirects=True)
    except requests.RequestException:
        # Only swallow request failures; Ctrl-C and programming errors
        # must still propagate.
        return False
    return resp.status_code == 200


def discover_bandcamp_url(show: Show, venue: Venue | None) -> str | None:
    """
    Try to find the Bandcamp album URL for a show.

    Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-venue-city-state

    Candidate slugs are probed in this order, and the first one that exists
    wins: ``venue-city-state``, ``city-state``, ``venue-city``. A candidate
    is only built when every part it needs is non-empty.

    Args:
        show: Show whose ``date`` provides the URL prefix.
        venue: Venue of the show, or None when unknown.

    Returns:
        The first candidate URL that responds with 200, or None.
    """
    if not venue:
        return None

    date_str = show.date.strftime("%Y-%m-%d")

    name_slug = slugify_for_bandcamp(venue.name) if venue.name else ""
    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
    state_slug = venue.state.lower() if venue.state else ""

    # Build venue slug variations
    venue_slugs = []
    if name_slug and city_slug and state_slug:
        venue_slugs.append(f"{name_slug}-{city_slug}-{state_slug}")
    if city_slug and state_slug:
        # Some albums just use the location
        venue_slugs.append(f"{city_slug}-{state_slug}")
    if name_slug and city_slug:
        venue_slugs.append(f"{name_slug}-{city_slug}")

    for venue_slug in venue_slugs:
        url = f"https://goosetheband.bandcamp.com/album/{date_str}-{venue_slug}"
        if check_url_exists(url):
            return url

    return None


def scrape_nugs_catalog() -> dict[str, str]:
    """
    Query the Nugs catalog API to build a mapping of date -> URL.

    The human-facing catalog (https://play.nugs.net/artist/461/latest) is a
    SPA, so the data is requested from the API endpoint instead.

    Returns:
        Dict like {"2024-12-31": "https://www.nugs.net/..."}; empty when the
        request fails, the body is not JSON, or the payload does not have
        the expected ``Response`` -> ``Shows`` structure.

    NOTE(review): the payload field names (``perfDate``, ``showID``) and the
    assumption that ``perfDate`` starts with YYYY-MM-DD come from the
    original implementation and have not been verified against the live API.
    """
    api_url = "https://streamapi.nugs.net/api.aspx"
    params = {
        "method": "catalog.container",
        "containerType": "artist",
        "containerId": "461",  # Goose's artist ID
    }
    nugs_shows: dict[str, str] = {}

    try:
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
        if resp.status_code != 200:
            print(f"Failed to fetch Nugs catalog: {resp.status_code}")
            return {}
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on every requests version.
        print(f"Error scraping Nugs: {e}")
        return nugs_shows

    # Validate the shape explicitly so one malformed entry cannot abort
    # the whole scrape.
    response = data.get("Response") if isinstance(data, dict) else None
    entries = response.get("Shows") if isinstance(response, dict) else None
    if not isinstance(entries, list):
        return nugs_shows

    for entry in entries:
        if not isinstance(entry, dict):
            continue
        perf_date = str(entry.get("perfDate") or "")[:10]  # YYYY-MM-DD
        show_id = entry.get("showID")
        if perf_date and show_id:
            nugs_shows[perf_date] = f"https://www.nugs.net/live-download/goose-{perf_date.replace('-', '')}-{show_id}.html"

    return nugs_shows


def discover_nugs_url_fallback(show: Show, venue: Venue | None) -> str | None:
    """
    Fallback: Try constructing Nugs URL patterns if API doesn't work.

    Probes ``goose-YYYYMMDD-city-state.html`` (only when both city and state
    are known) and then ``goose-YYYYMMDD.html``.

    Args:
        show: Show whose ``date`` is embedded in the URL.
        venue: Venue of the show, or None when unknown.

    Returns:
        The first pattern that responds with 200, or None.
    """
    if not venue:
        return None

    date_compact = show.date.strftime("%Y%m%d")

    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
    state = venue.state.lower() if venue.state else ""

    # Try common Nugs URL patterns
    patterns = []
    if city_slug and state:
        # Without this guard a missing part produced malformed probes
        # such as "goose-20240101--.html".
        patterns.append(
            f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html"
        )
    patterns.append(f"https://www.nugs.net/live-download/goose-{date_compact}.html")

    for url in patterns:
        if check_url_exists(url):
            return url

    return None


def main(dry_run: bool = True, limit: int | None = None):
    """
    Main discovery function.

    Walks shows from newest to oldest, probes for show-specific Bandcamp and
    Nugs URLs on every show that has no link or only a generic band-level
    one, prints each hit and a summary, and commits once at the end unless
    this is a dry run.

    Args:
        dry_run: If True, print changes but don't write to DB
        limit: Max number of shows to process (for testing); None or 0
            means no limit
    """
    print("=" * 60)
    print("Show Audio Link Discovery")
    print("=" * 60)

    # Stats
    bandcamp_found = 0
    nugs_found = 0
    already_have_bandcamp = 0
    already_have_nugs = 0

    with Session(engine) as session:
        # Get all shows, most recent first
        query = select(Show).order_by(Show.date.desc())
        if limit:
            query = query.limit(limit)

        shows = session.exec(query).all()
        print(f"Found {len(shows)} shows to process")

        for show in shows:
            venue = session.get(Venue, show.venue_id) if show.venue_id else None
            date_str = show.date.strftime("%Y-%m-%d")

            # A link counts as generic when it points at the band rather
            # than at a specific show; generic links get replaced.
            bandcamp_link = show.bandcamp_link or ""
            nugs_link = show.nugs_link or ""
            generic_bandcamp = "goosetheband.bandcamp.com" in bandcamp_link and "/album/" not in bandcamp_link
            generic_nugs = "utm_source=goose" in nugs_link

            # Bandcamp discovery
            if not show.bandcamp_link or generic_bandcamp:
                bc_url = discover_bandcamp_url(show, venue)
                if bc_url:
                    print(f"✓ Bandcamp: {date_str} -> {bc_url}")
                    bandcamp_found += 1
                    if not dry_run:
                        show.bandcamp_link = bc_url
                        session.add(show)
            else:
                already_have_bandcamp += 1

            # Nugs discovery (using fallback for now)
            if not show.nugs_link or generic_nugs:
                nugs_url = discover_nugs_url_fallback(show, venue)
                if nugs_url:
                    print(f"✓ Nugs: {date_str} -> {nugs_url}")
                    nugs_found += 1
                    if not dry_run:
                        show.nugs_link = nugs_url
                        session.add(show)
            else:
                already_have_nugs += 1

        if not dry_run:
            session.commit()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Shows processed: {len(shows)}")
    print(f"Bandcamp links found: {bandcamp_found}")
    print(f"Nugs links found: {nugs_found}")
    print(f"Already had Bandcamp: {already_have_bandcamp}")
    print(f"Already had Nugs: {already_have_nugs}")

    if dry_run:
        print("\n[DRY RUN - No changes saved. Run with dry_run=False to save]")


if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    dry_run = "--save" not in cli_args
    limit = None if "--all" in cli_args else 20  # Test with 20 shows first

    main(dry_run=dry_run, limit=limit)