fix: update Nugs discovery to scrape catalog page

2025-12-24 14:04:10 -08:00 · 2025-12-24 14:04:10 -08:00 · d54b217264
commit d54b217264
parent 58edc0e070
1 changed file with 36 additions and 42 deletions
--- a/backend/discover_audio_links.py
+++ b/backend/discover_audio_links.py
@@ -73,62 +73,56 @@ def scrape_nugs_catalog() -> dict:
    Scrape the Nugs Goose catalog page to build a mapping of date -> URL.
    Returns dict like {"2024-12-31": "https://www.nugs.net/..."}
    """
-    catalog_url = "https://play.nugs.net/artist/461/latest"
+    catalog_url = "https://www.nugs.net/goose-concerts-live-downloads-in-mp3-flac-or-online-music-streaming/"
    nugs_shows = {}
    
    try:
-        # The Nugs catalog is a SPA, so we need to use their API
-        # API endpoint for artist shows
-        api_url = "https://streamapi.nugs.net/api.aspx"
-        params = {
-            "method": "catalog.container",
-            "containerType": "artist",
-            "containerId": "461",  # Goose's artist ID
-        }
-        
-        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
+        resp = requests.get(catalog_url, headers=HEADERS, timeout=30)
        if resp.status_code != 200:
            print(f"Failed to fetch Nugs catalog: {resp.status_code}")
            return {}
        
-        # Parse response to extract shows
-        data = resp.json()
-        if "Response" in data and "Shows" in data["Response"]:
-            for show in data["Response"]["Shows"]:
-                perfDate = show.get("perfDate", "")[:10]  # YYYY-MM-DD
-                showId = show.get("showID")
-                if perfDate and showId:
-                    nugs_shows[perfDate] = f"https://www.nugs.net/live-download/goose-{perfDate.replace('-', '')}-{showId}.html"
+        html = resp.text
+        
+        # Extract URLs and dates using regex
+        # URL pattern: https://www.nugs.net/live-download-of-goose-...-MM-DD-YYYY-mp3-flac-or-online-music-streaming/ID.html
+        url_pattern = r'(https://www\.nugs\.net/live-download-of-goose-[^"]+\.html)'
+        urls = re.findall(url_pattern, html)
+        
+        # Date in URL: M-D-YYYY or MM-DD-YYYY, immediately before "-mp3-flac"
+        for url in set(urls):  # deduplicate
+            # Try to extract date from URL
+            date_match = re.search(r'-(\d{1,2})-(\d{1,2})-(\d{4})-mp3-flac', url)
+            if date_match:
+                month, day, year = date_match.groups()
+                date_str = f"{year}-{int(month):02d}-{int(day):02d}"
+                nugs_shows[date_str] = url
+        
+        print(f"  Scraped {len(nugs_shows)} Nugs show URLs from catalog")
        
    except Exception as e:
        print(f"Error scraping Nugs: {e}")
    
    return nugs_shows

-def discover_nugs_url_fallback(show: Show, venue: Venue) -> str | None:
-    """
-    Fallback: Try constructing Nugs URL patterns if API doesn't work.
-    """
-    if not venue:
-        return None
+# Module-level cache for the scraped Nugs catalog (date -> URL); None until first built
+_nugs_catalog = None

+def get_nugs_catalog():
+    """Get or build Nugs catalog cache"""
+    global _nugs_catalog
+    if _nugs_catalog is None:
+        print("Building Nugs catalog from website...")
+        _nugs_catalog = scrape_nugs_catalog()
+    return _nugs_catalog
+
+def discover_nugs_url(show: Show) -> str | None:
+    """
+    Look up Nugs URL from catalog by date.
+    """
    date_str = show.date.strftime("%Y-%m-%d")
-    date_compact = show.date.strftime("%Y%m%d")
-    
-    # Try common Nugs URL patterns
-    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
-    state = venue.state.lower() if venue.state else ""
-    
-    patterns = [
-        f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html",
-        f"https://www.nugs.net/live-download/goose-{date_compact}.html",
-    ]
-    
-    for url in patterns:
-        if check_url_exists(url):
-            return url
-    
-    return None
+    catalog = get_nugs_catalog()
+    return catalog.get(date_str)

 def main(dry_run: bool = True, limit: int = None):
    """
@@ -177,9 +171,9 @@ def main(dry_run: bool = True, limit: int = None):
            else:
                already_have_bandcamp += 1
            
-            # Nugs discovery (using fallback for now)
+            # Nugs discovery
            if not show.nugs_link or generic_nugs:
-                nugs_url = discover_nugs_url_fallback(show, venue)
+                nugs_url = discover_nugs_url(show)
                if nugs_url:
                    print(f"✓ Nugs: {date_str} -> {nugs_url}")
                    nugs_found += 1