"""
Show Audio Link Discovery Script

This script discovers show-specific Bandcamp and Nugs.net URLs by:
1. For Bandcamp: Testing URL patterns based on date and venue
2. For Nugs: Scraping the Nugs catalog page to extract show URLs

Run from backend container:
    python discover_audio_links.py

Command-line flags (see the ``__main__`` guard at the bottom):
    --save  write discovered links to the database (default is a dry run)
    --all   process every show (default is the 20 most recent)
"""

import re
import sys
from datetime import datetime

import requests
from sqlmodel import Session, select

from database import engine
from models import Show, Venue
from slugify import generate_slug

# Session headers to mimic browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}


def slugify_for_bandcamp(text: str) -> str:
    """Convert text to Bandcamp-friendly slug (lowercase, hyphens, no special chars)."""
    text = text.lower()
    # Drop everything except lowercase letters, digits, whitespace and hyphens.
    text = re.sub(r'[^a-z0-9\s-]', '', text)
    # Runs of whitespace/underscores become a single hyphen...
    text = re.sub(r'[\s_]+', '-', text)
    # ...and runs of hyphens are collapsed.
    text = re.sub(r'-+', '-', text)
    return text.strip('-')


def check_url_exists(url: str) -> bool:
    """Return True if a HEAD request to *url* (following redirects) answers 200.

    Any ``requests`` failure (connection error, timeout, invalid URL, ...) is
    treated as "does not exist" and yields False.
    """
    try:
        resp = requests.head(url, headers=HEADERS, timeout=5, allow_redirects=True)
    except requests.RequestException:
        # Only swallow request-level failures; a bare ``except`` here would
        # also swallow KeyboardInterrupt and make the script un-interruptible.
        return False
    return resp.status_code == 200


def discover_bandcamp_url(show: Show, venue: Venue) -> str | None:
    """
    Try to find Bandcamp album URL for a show.

    Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-venue-city-state

    Candidate slugs are probed in this order and the first URL that exists is
    returned:
        1. venue-city-state
        2. city-state (some albums just use location)
        3. venue-city

    Returns None when there is no venue, the venue has no city, or none of
    the candidates exists.
    """
    # Every candidate pattern needs a city, so there is nothing to probe
    # without one.
    if not venue or not venue.city:
        return None

    date_str = show.date.strftime("%Y-%m-%d")
    name_slug = slugify_for_bandcamp(venue.name)
    city_slug = slugify_for_bandcamp(venue.city)

    # Build venue slug variations
    venue_slugs = []
    if venue.state:
        state = venue.state.lower()
        venue_slugs.append(f"{name_slug}-{city_slug}-{state}")
        venue_slugs.append(f"{city_slug}-{state}")
    venue_slugs.append(f"{name_slug}-{city_slug}")

    for venue_slug in venue_slugs:
        url = f"https://goosetheband.bandcamp.com/album/{date_str}-{venue_slug}"
        if check_url_exists(url):
            return url

    return None


def scrape_nugs_catalog() -> dict:
    """
    Scrape the Nugs Goose catalog page to build a mapping of date -> URL.

    Returns dict like {"2024-12-31": "https://www.nugs.net/..."}.
    An empty dict is returned when the page cannot be fetched or does not
    answer 200. When several catalog URLs carry the same date, the one that
    appears first in the page is kept.
    """
    catalog_url = "https://www.nugs.net/goose-concerts-live-downloads-in-mp3-flac-or-online-music-streaming/"

    try:
        resp = requests.get(catalog_url, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"Error scraping Nugs: {e}")
        return {}

    if resp.status_code != 200:
        print(f"Failed to fetch Nugs catalog: {resp.status_code}")
        return {}

    html = resp.text

    # Extract URLs and dates using regex
    # URL pattern: https://www.nugs.net/live-download-of-goose-...-MM-DD-YYYY-mp3-flac-or-online-music-streaming/ID.html
    url_pattern = r'(https://www\.nugs\.net/live-download-of-goose-[^"]+\.html)'
    urls = re.findall(url_pattern, html)

    # Date pattern in URL: MM-DD-YYYY (before mp3-flac...)
    date_pattern = re.compile(r'-(\d{1,2})-(\d{1,2})-(\d{4})-mp3-flac')

    nugs_shows = {}
    # dict.fromkeys deduplicates while keeping document order; iterating a
    # set() instead would make the winner for a duplicated date depend on
    # string hash randomization and differ from run to run.
    for url in dict.fromkeys(urls):
        date_match = date_pattern.search(url)
        if date_match:
            month, day, year = date_match.groups()
            date_str = f"{year}-{int(month):02d}-{int(day):02d}"
            nugs_shows.setdefault(date_str, url)

    print(f" Scraped {len(nugs_shows)} Nugs show URLs from catalog")
    return nugs_shows


# Global cache for Nugs catalog
_nugs_catalog = None


def get_nugs_catalog() -> dict:
    """Get or build Nugs catalog cache.

    The catalog is scraped on the first call only; the result (even an empty
    one from a failed scrape) is reused for the rest of the process.
    """
    global _nugs_catalog
    if _nugs_catalog is None:
        print("Building Nugs catalog from website...")
        _nugs_catalog = scrape_nugs_catalog()
    return _nugs_catalog


def discover_nugs_url(show: Show) -> str | None:
    """Look up Nugs URL from catalog by date; None when the date is not listed."""
    date_str = show.date.strftime("%Y-%m-%d")
    return get_nugs_catalog().get(date_str)


def main(dry_run: bool = True, limit: int | None = None):
    """
    Main discovery function.

    Args:
        dry_run: If True, print changes but don't write to DB
        limit: Max number of shows to process (for testing); None or 0
            means no limit
    """
    print("=" * 60)
    print("Show Audio Link Discovery")
    print("=" * 60)

    with Session(engine) as session:
        # Get all shows, most recent first
        query = select(Show).order_by(Show.date.desc())
        if limit:
            query = query.limit(limit)

        shows = session.exec(query).all()
        print(f"Found {len(shows)} shows to process")

        # Stats
        bandcamp_found = 0
        nugs_found = 0
        already_have_bandcamp = 0
        already_have_nugs = 0

        for show in shows:
            venue = session.get(Venue, show.venue_id) if show.venue_id else None
            date_str = show.date.strftime("%Y-%m-%d")

            # Skip if already has specific links (not the generic band-level ones)
            bandcamp_link = show.bandcamp_link or ""
            nugs_link = show.nugs_link or ""
            generic_bandcamp = (
                "goosetheband.bandcamp.com" in bandcamp_link
                and "/album/" not in bandcamp_link
            )
            generic_nugs = "utm_source=goose" in nugs_link

            # Bandcamp discovery
            if not show.bandcamp_link or generic_bandcamp:
                bc_url = discover_bandcamp_url(show, venue)
                if bc_url:
                    print(f"✓ Bandcamp: {date_str} -> {bc_url}")
                    bandcamp_found += 1
                    if not dry_run:
                        show.bandcamp_link = bc_url
                        session.add(show)
            else:
                already_have_bandcamp += 1

            # Nugs discovery
            if not show.nugs_link or generic_nugs:
                nugs_url = discover_nugs_url(show)
                if nugs_url:
                    print(f"✓ Nugs: {date_str} -> {nugs_url}")
                    nugs_found += 1
                    if not dry_run:
                        show.nugs_link = nugs_url
                        session.add(show)
            else:
                already_have_nugs += 1

        # All updates are written in a single commit after the loop.
        if not dry_run:
            session.commit()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Shows processed: {len(shows)}")
    print(f"Bandcamp links found: {bandcamp_found}")
    print(f"Nugs links found: {nugs_found}")
    print(f"Already had Bandcamp: {already_have_bandcamp}")
    print(f"Already had Nugs: {already_have_nugs}")

    if dry_run:
        print("\n[DRY RUN - No changes saved. Run with dry_run=False to save]")


if __name__ == "__main__":
    cli_args = sys.argv[1:]

    # Defaults are deliberately safe: dry run, and test with 20 shows first.
    dry_run = "--save" not in cli_args
    limit = None if "--all" in cli_args else 20

    main(dry_run=dry_run, limit=limit)