fix: update Nugs discovery to scrape catalog page

2025-12-24 14:04:10 -08:00 · 2025-12-24 14:04:10 -08:00 · d54b217264
commit d54b217264
parent 58edc0e070
1 changed file with 36 additions and 42 deletions
--- a/backend/discover_audio_links.py
+++ b/backend/discover_audio_links.py
@@ -73,62 +73,56 @@ def scrape_nugs_catalog() -> dict:
    Scrape the Nugs Goose catalog page to build a mapping of date -> URL.
    Returns dict like {"2024-12-31": "https://www.nugs.net/..."}
    """
-    catalog_url = "https://play.nugs.net/artist/461/latest"
+    catalog_url = "https://www.nugs.net/goose-concerts-live-downloads-in-mp3-flac-or-online-music-streaming/"
    nugs_shows = {}
    try:
-        # The Nugs catalog is a SPA, so we need to use their API
+        resp = requests.get(catalog_url, headers=HEADERS, timeout=30)
        # API endpoint for artist shows
        api_url = "https://streamapi.nugs.net/api.aspx"
        params = {
            "method": "catalog.container",
            "containerType": "artist",
            "containerId": "461",  # Goose's artist ID
        }
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
        if resp.status_code != 200:
            print(f"Failed to fetch Nugs catalog: {resp.status_code}")
            return {}
-        # Parse response to extract shows
+        html = resp.text
-        data = resp.json()
+        
-        if "Response" in data and "Shows" in data["Response"]:
+        # Extract URLs and dates using regex
-            for show in data["Response"]["Shows"]:
+        # URL pattern: https://www.nugs.net/live-download-of-goose-...-MM-DD-YYYY-mp3-flac-or-online-music-streaming/ID.html
-                perfDate = show.get("perfDate", "")[:10]  # YYYY-MM-DD
+        url_pattern = r'(https://www\.nugs\.net/live-download-of-goose-[^"]+\.html)'
-                showId = show.get("showID")
+        urls = re.findall(url_pattern, html)
-                if perfDate and showId:
+        
-                    nugs_shows[perfDate] = f"https://www.nugs.net/live-download/goose-{perfDate.replace('-', '')}-{showId}.html"
+        # Date pattern in URL: MM-DD-YYYY (before mp3-flac...)
        for url in set(urls):  # deduplicate
            # Try to extract date from URL
            date_match = re.search(r'-(\d{1,2})-(\d{1,2})-(\d{4})-mp3-flac', url)
            if date_match:
                month, day, year = date_match.groups()
                date_str = f"{year}-{int(month):02d}-{int(day):02d}"
                nugs_shows[date_str] = url
        print(f"  Scraped {len(nugs_shows)} Nugs show URLs from catalog")
    except Exception as e:
        print(f"Error scraping Nugs: {e}")
    return nugs_shows
-def discover_nugs_url_fallback(show: Show, venue: Venue) -> str | None:
+# Global cache for Nugs catalog
 _nugs_catalog = None
 def get_nugs_catalog():
    """Get or build Nugs catalog cache"""
    global _nugs_catalog
    if _nugs_catalog is None:
        print("Building Nugs catalog from website...")
        _nugs_catalog = scrape_nugs_catalog()
    return _nugs_catalog
 def discover_nugs_url(show: Show) -> str | None:
    """
-    Fallback: Try constructing Nugs URL patterns if API doesn't work.
+    Look up Nugs URL from catalog by date.
    """
    if not venue:
        return None
    date_str = show.date.strftime("%Y-%m-%d")
-    date_compact = show.date.strftime("%Y%m%d")
+    catalog = get_nugs_catalog()
-    
+    return catalog.get(date_str)
    # Try common Nugs URL patterns
    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
    state = venue.state.lower() if venue.state else ""
    patterns = [
        f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html",
        f"https://www.nugs.net/live-download/goose-{date_compact}.html",
    ]
    for url in patterns:
        if check_url_exists(url):
            return url
    return None
 def main(dry_run: bool = True, limit: int = None):
    """
@@ -177,9 +171,9 @@ def main(dry_run: bool = True, limit: int = None):
            else:
                already_have_bandcamp += 1
-            # Nugs discovery (using fallback for now)
+            # Nugs discovery
            if not show.nugs_link or generic_nugs:
-                nugs_url = discover_nugs_url_fallback(show, venue)
+                nugs_url = discover_nugs_url(show)
                if nugs_url:
                    print(f"✓ Nugs: {date_str} -> {nugs_url}")
                    nugs_found += 1