diff --git a/backend/discover_audio_links.py b/backend/discover_audio_links.py
index 16c0260..43e6486 100644
--- a/backend/discover_audio_links.py
+++ b/backend/discover_audio_links.py
@@ -73,62 +73,56 @@ def scrape_nugs_catalog() -> dict:
     Scrape the Nugs Goose catalog page to build a mapping of date -> URL.
     Returns dict like {"2024-12-31": "https://www.nugs.net/..."}
     """
-    catalog_url = "https://play.nugs.net/artist/461/latest"
+    catalog_url = "https://www.nugs.net/goose-concerts-live-downloads-in-mp3-flac-or-online-music-streaming/"
     nugs_shows = {}
 
     try:
-        # The Nugs catalog is a SPA, so we need to use their API
-        # API endpoint for artist shows
-        api_url = "https://streamapi.nugs.net/api.aspx"
-        params = {
-            "method": "catalog.container",
-            "containerType": "artist",
-            "containerId": "461",  # Goose's artist ID
-        }
-
-        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
+        resp = requests.get(catalog_url, headers=HEADERS, timeout=30)
         if resp.status_code != 200:
             print(f"Failed to fetch Nugs catalog: {resp.status_code}")
             return {}
 
-        # Parse response to extract shows
-        data = resp.json()
-        if "Response" in data and "Shows" in data["Response"]:
-            for show in data["Response"]["Shows"]:
-                perfDate = show.get("perfDate", "")[:10]  # YYYY-MM-DD
-                showId = show.get("showID")
-                if perfDate and showId:
-                    nugs_shows[perfDate] = f"https://www.nugs.net/live-download/goose-{perfDate.replace('-', '')}-{showId}.html"
+        html = resp.text
+
+        # Extract URLs and dates using regex
+        # URL pattern: https://www.nugs.net/live-download-of-goose-...-MM-DD-YYYY-mp3-flac-or-online-music-streaming/ID.html
+        url_pattern = r'(https://www\.nugs\.net/live-download-of-goose-[^"]+\.html)'
+        urls = re.findall(url_pattern, html)
+
+        # Date pattern in URL: MM-DD-YYYY (before mp3-flac...)
+        for url in set(urls):  # deduplicate
+            # Try to extract date from URL
+            date_match = re.search(r'-(\d{1,2})-(\d{1,2})-(\d{4})-mp3-flac', url)
+            if date_match:
+                month, day, year = date_match.groups()
+                date_str = f"{year}-{int(month):02d}-{int(day):02d}"
+                nugs_shows[date_str] = url
+
+        print(f" Scraped {len(nugs_shows)} Nugs show URLs from catalog")
 
     except Exception as e:
         print(f"Error scraping Nugs: {e}")
 
     return nugs_shows
 
-def discover_nugs_url_fallback(show: Show, venue: Venue) -> str | None:
+# Global cache for Nugs catalog
+_nugs_catalog = None
+
+def get_nugs_catalog():
+    """Get or build Nugs catalog cache"""
+    global _nugs_catalog
+    if _nugs_catalog is None:
+        print("Building Nugs catalog from website...")
+        _nugs_catalog = scrape_nugs_catalog()
+    return _nugs_catalog
+
+def discover_nugs_url(show: Show) -> str | None:
     """
-    Fallback: Try constructing Nugs URL patterns if API doesn't work.
+    Look up Nugs URL from catalog by date.
     """
-    if not venue:
-        return None
-
     date_str = show.date.strftime("%Y-%m-%d")
-    date_compact = show.date.strftime("%Y%m%d")
-
-    # Try common Nugs URL patterns
-    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
-    state = venue.state.lower() if venue.state else ""
-
-    patterns = [
-        f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html",
-        f"https://www.nugs.net/live-download/goose-{date_compact}.html",
-    ]
-
-    for url in patterns:
-        if check_url_exists(url):
-            return url
-
-    return None
+    catalog = get_nugs_catalog()
+    return catalog.get(date_str)
 
 def main(dry_run: bool = True, limit: int = None):
     """
@@ -177,9 +171,9 @@ def main(dry_run: bool = True, limit: int = None):
         else:
             already_have_bandcamp += 1
 
-        # Nugs discovery (using fallback for now)
+        # Nugs discovery
         if not show.nugs_link or generic_nugs:
-            nugs_url = discover_nugs_url_fallback(show, venue)
+            nugs_url = discover_nugs_url(show)
             if nugs_url:
                 print(f"✓ Nugs: {date_str} -> {nugs_url}")
                 nugs_found += 1