# fediversion/backend/discover_audio_links.py

"""
Show Audio Link Discovery Script

This script discovers show-specific Bandcamp and Nugs.net URLs by:
1. For Bandcamp: Testing URL patterns based on date and venue
2. For Nugs: Scraping the Nugs catalog page to extract show URLs

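Run from backend container: python discover_audio_links.py [--save] [--all]
By default this is a dry run over the 20 most recent shows; pass --save to
write discovered links to the database and --all to process every show.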
"""
import re
import requests
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Show, Venue
from slugify import generate_slug

# Request headers sent with every outbound HTTP call so the request
# mimics a desktop browser.
_BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
HEADERS = {"User-Agent": _BROWSER_USER_AGENT}

def slugify_for_bandcamp(text: str) -> str:
    """Return *text* as a Bandcamp-style URL slug.

    The input is lowercased, every character other than ``a-z``, ``0-9``,
    whitespace and hyphens is dropped, whitespace runs become a single
    hyphen, repeated hyphens are collapsed, and leading/trailing hyphens
    are trimmed.
    """
    # Applied in order. Underscores are already removed by the first pass,
    # so the second pattern effectively only sees whitespace.
    substitutions = (
        (r'[^a-z0-9\s-]', ''),
        (r'[\s_]+', '-'),
        (r'-+', '-'),
    )
    slug = text.lower()
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)
    return slug.strip('-')

def check_url_exists(url: str) -> bool:
    """Return True when a HEAD request to *url* ends in a 200 response.

    Redirects are followed, so the status checked is that of the final
    response. Request-level failures (connection error, timeout, invalid
    URL, ...) are reported as ``False`` instead of being raised.
    """
    try:
        resp = requests.head(url, headers=HEADERS, timeout=5, allow_redirects=True)
    except requests.RequestException:
        # Only treat request failures as "does not exist". A bare
        # ``except`` here would also swallow KeyboardInterrupt/SystemExit,
        # making a long discovery run impossible to interrupt cleanly.
        return False
    return resp.status_code == 200

def discover_bandcamp_url(show: Show, venue: Venue) -> str | None:
    """
    Try to find the Bandcamp album URL for a show.

    Candidate URLs follow the pattern
    https://goosetheband.bandcamp.com/album/YYYY-MM-DD-<slug>, where <slug>
    is tried in this order:
      1. venue-city-state
      2. city-state (some albums just use the location)
      3. venue-city

    Args:
        show: Show whose ``date`` supplies the YYYY-MM-DD prefix.
        venue: Venue supplying ``name``, ``city`` and ``state``; may be falsy.

    Returns:
        The first candidate URL for which ``check_url_exists`` is true, or
        None if there is no venue, the venue has no city, or no candidate
        exists.
    """
    # Every pattern needs a city, so without one there is nothing to try.
    if not venue or not venue.city:
        return None

    date_str = show.date.strftime("%Y-%m-%d")

    # Slugify every component (including the state) so spaces or
    # punctuation in any field cannot leak into the URL.
    name_slug = slugify_for_bandcamp(venue.name or "")
    city_slug = slugify_for_bandcamp(venue.city)
    state_slug = slugify_for_bandcamp(venue.state) if venue.state else ""

    candidates = []
    if state_slug:
        candidates.append((name_slug, city_slug, state_slug))
        candidates.append((city_slug, state_slug))
    candidates.append((name_slug, city_slug))

    tried = set()
    for parts in candidates:
        # Drop empty components so a missing piece never leaves a stray
        # leading/trailing/double hyphen in the slug.
        venue_slug = "-".join(part for part in parts if part)
        if not venue_slug or venue_slug in tried:
            continue  # nothing to probe, or already probed this exact slug
        tried.add(venue_slug)

        url = f"https://goosetheband.bandcamp.com/album/{date_str}-{venue_slug}"
        if check_url_exists(url):
            return url

    return None

def _parse_nugs_catalog(html: str) -> dict[str, str]:
    """Map ISO dates (YYYY-MM-DD) to the first matching show URL in *html*."""
    # URL pattern: https://www.nugs.net/live-download-of-goose-...-MM-DD-YYYY-mp3-flac-or-online-music-streaming/ID.html
    url_pattern = r'(https://www\.nugs\.net/live-download-of-goose-[^"]+\.html)'
    # Date inside the URL: MM-DD-YYYY immediately before "mp3-flac".
    date_pattern = re.compile(r'-(\d{1,2})-(\d{1,2})-(\d{4})-mp3-flac')

    shows_by_date: dict[str, str] = {}
    # Walk URLs in page order (not via a set) so that when several URLs
    # share a date the chosen one is the same on every run.
    for url in re.findall(url_pattern, html):
        date_match = date_pattern.search(url)
        if not date_match:
            continue
        month, day, year = date_match.groups()
        date_str = f"{year}-{int(month):02d}-{int(day):02d}"
        # setdefault keeps the first URL seen for a date and ignores repeats.
        shows_by_date.setdefault(date_str, url)
    return shows_by_date


def scrape_nugs_catalog() -> dict[str, str]:
    """
    Scrape the Nugs Goose catalog page to build a mapping of date -> URL.

    Returns:
        Dict like {"2024-12-31": "https://www.nugs.net/..."}. An empty dict
        is returned (after printing the reason) when the request fails or
        the page does not answer with HTTP 200.
    """
    catalog_url = "https://www.nugs.net/goose-concerts-live-downloads-in-mp3-flac-or-online-music-streaming/"

    # Only the network call is guarded; scraping stays best-effort without
    # masking unrelated programming errors.
    try:
        resp = requests.get(catalog_url, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"Error scraping Nugs: {e}")
        return {}

    if resp.status_code != 200:
        print(f"Failed to fetch Nugs catalog: {resp.status_code}")
        return {}

    nugs_shows = _parse_nugs_catalog(resp.text)
    print(f"  Scraped {len(nugs_shows)} Nugs show URLs from catalog")
    return nugs_shows

# Module-level cache for the scraped Nugs catalog; None until first use.
_nugs_catalog = None

def get_nugs_catalog():
    """Return the cached Nugs catalog, scraping it on the first call."""
    global _nugs_catalog
    if _nugs_catalog is not None:
        return _nugs_catalog

    print("Building Nugs catalog from website...")
    _nugs_catalog = scrape_nugs_catalog()
    return _nugs_catalog

def discover_nugs_url(show: Show) -> str | None:
    """
    Return the Nugs URL recorded for the show's date, or None if the
    catalog has no entry for that date.
    """
    # Catalog keys are YYYY-MM-DD strings, so format the date the same way.
    lookup_key = show.date.strftime("%Y-%m-%d")
    return get_nugs_catalog().get(lookup_key)

def main(dry_run: bool = True, limit: int = None):
    """
    Discover show-specific audio links and optionally persist them.

    Shows are loaded newest-first. For each show whose Bandcamp or Nugs
    link is missing or only a generic band-level link, a show-specific URL
    is looked up; hits are printed, and a summary is printed at the end.

    Args:
        dry_run: If True, print changes but don't write to DB
        limit: Max number of shows to process (for testing); a falsy value
            means no limit
    """
    banner = "=" * 60
    print(banner)
    print("Show Audio Link Discovery")
    print(banner)

    with Session(engine) as session:
        # Newest shows first, optionally capped.
        stmt = select(Show).order_by(Show.date.desc())
        if limit:
            stmt = stmt.limit(limit)

        shows = session.exec(stmt).all()
        print(f"Found {len(shows)} shows to process")

        # Counters for the summary.
        bandcamp_found = nugs_found = 0
        already_have_bandcamp = already_have_nugs = 0

        for show in shows:
            venue = None
            if show.venue_id:
                venue = session.get(Venue, show.venue_id)
            date_str = show.date.strftime("%Y-%m-%d")

            existing_bc = show.bandcamp_link or ""
            existing_nugs = show.nugs_link or ""

            # A link is "generic" when it points at the band-level page
            # rather than at a specific show.
            bc_is_generic = (
                "goosetheband.bandcamp.com" in existing_bc
                and "/album/" not in existing_bc
            )
            nugs_is_generic = "utm_source=goose" in existing_nugs

            # Bandcamp: keep an existing show-specific link, else discover.
            if existing_bc and not bc_is_generic:
                already_have_bandcamp += 1
            else:
                bc_url = discover_bandcamp_url(show, venue)
                if bc_url:
                    print(f"✓ Bandcamp: {date_str} -> {bc_url}")
                    bandcamp_found += 1
                    if not dry_run:
                        show.bandcamp_link = bc_url
                        session.add(show)

            # Nugs: same rule, looked up by date in the scraped catalog.
            if existing_nugs and not nugs_is_generic:
                already_have_nugs += 1
            else:
                nugs_url = discover_nugs_url(show)
                if nugs_url:
                    print(f"✓ Nugs: {date_str} -> {nugs_url}")
                    nugs_found += 1
                    if not dry_run:
                        show.nugs_link = nugs_url
                        session.add(show)

        # All updates are written in a single commit after the loop.
        if not dry_run:
            session.commit()

        print("\n" + banner)
        print("Summary")
        print(banner)
        print(f"Shows processed: {len(shows)}")
        print(f"Bandcamp links found: {bandcamp_found}")
        print(f"Nugs links found: {nugs_found}")
        print(f"Already had Bandcamp: {already_have_bandcamp}")
        print(f"Already had Nugs: {already_have_nugs}")

        if dry_run:
            print("\n[DRY RUN - No changes saved. Run with dry_run=False to save]")

if __name__ == "__main__":
    import sys

    # Flags are only honoured when at least one argument was passed.
    # Defaults: dry run, limited to 20 shows (a safe first test).
    has_args = len(sys.argv) > 1
    dry_run = not (has_args and "--save" in sys.argv)
    limit = None if (has_args and "--all" in sys.argv) else 20

    main(dry_run=dry_run, limit=limit)