fix: update Nugs discovery to scrape catalog page
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run

This commit is contained in:
fullsizemalt 2025-12-24 14:04:10 -08:00
parent 58edc0e070
commit d54b217264

View file

@ -73,62 +73,56 @@ def scrape_nugs_catalog() -> dict:
Scrape the Nugs Goose catalog page to build a mapping of date -> URL.
Returns dict like {"2024-12-31": "https://www.nugs.net/..."}
"""
catalog_url = "https://play.nugs.net/artist/461/latest"
catalog_url = "https://www.nugs.net/goose-concerts-live-downloads-in-mp3-flac-or-online-music-streaming/"
nugs_shows = {}
try:
# The Nugs catalog is a SPA, so we need to use their API
# API endpoint for artist shows
api_url = "https://streamapi.nugs.net/api.aspx"
params = {
"method": "catalog.container",
"containerType": "artist",
"containerId": "461", # Goose's artist ID
}
resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
resp = requests.get(catalog_url, headers=HEADERS, timeout=30)
if resp.status_code != 200:
print(f"Failed to fetch Nugs catalog: {resp.status_code}")
return {}
# Parse response to extract shows
data = resp.json()
if "Response" in data and "Shows" in data["Response"]:
for show in data["Response"]["Shows"]:
perfDate = show.get("perfDate", "")[:10] # YYYY-MM-DD
showId = show.get("showID")
if perfDate and showId:
nugs_shows[perfDate] = f"https://www.nugs.net/live-download/goose-{perfDate.replace('-', '')}-{showId}.html"
html = resp.text
# Extract URLs and dates using regex
# URL pattern: https://www.nugs.net/live-download-of-goose-...-MM-DD-YYYY-mp3-flac-or-online-music-streaming/ID.html
url_pattern = r'(https://www\.nugs\.net/live-download-of-goose-[^"]+\.html)'
urls = re.findall(url_pattern, html)
# Date pattern in URL: MM-DD-YYYY (before mp3-flac...)
for url in set(urls): # deduplicate
# Try to extract date from URL
date_match = re.search(r'-(\d{1,2})-(\d{1,2})-(\d{4})-mp3-flac', url)
if date_match:
month, day, year = date_match.groups()
date_str = f"{year}-{int(month):02d}-{int(day):02d}"
nugs_shows[date_str] = url
print(f" Scraped {len(nugs_shows)} Nugs show URLs from catalog")
except Exception as e:
print(f"Error scraping Nugs: {e}")
return nugs_shows
def discover_nugs_url_fallback(show: Show, venue: Venue) -> str | None:
"""
Fallback: Try constructing Nugs URL patterns if API doesn't work.
"""
if not venue:
return None
# Module-level memo for the scraped Nugs catalog; stays None until first use.
_nugs_catalog = None


def get_nugs_catalog():
    """Return the cached Nugs catalog, scraping it on the first call.

    The value returned by scrape_nugs_catalog() is stored in the
    module-level _nugs_catalog and handed back on every later call.
    Only None triggers a scrape, so an empty result is cached as well.
    """
    global _nugs_catalog
    # Fast path: a previous call already populated the cache.
    if _nugs_catalog is not None:
        return _nugs_catalog
    print("Building Nugs catalog from website...")
    _nugs_catalog = scrape_nugs_catalog()
    return _nugs_catalog
def discover_nugs_url(show: Show) -> str | None:
"""
Look up Nugs URL from catalog by date.
"""
date_str = show.date.strftime("%Y-%m-%d")
date_compact = show.date.strftime("%Y%m%d")
# Try common Nugs URL patterns
city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
state = venue.state.lower() if venue.state else ""
patterns = [
f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html",
f"https://www.nugs.net/live-download/goose-{date_compact}.html",
]
for url in patterns:
if check_url_exists(url):
return url
return None
catalog = get_nugs_catalog()
return catalog.get(date_str)
def main(dry_run: bool = True, limit: int = None):
"""
@ -177,9 +171,9 @@ def main(dry_run: bool = True, limit: int = None):
else:
already_have_bandcamp += 1
# Nugs discovery (using fallback for now)
# Nugs discovery
if not show.nugs_link or generic_nugs:
nugs_url = discover_nugs_url_fallback(show, venue)
nugs_url = discover_nugs_url(show)
if nugs_url:
print(f"✓ Nugs: {date_str} -> {nugs_url}")
nugs_found += 1