fix: update Nugs discovery to scrape catalog page
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
This commit is contained in:
parent
58edc0e070
commit
d54b217264
1 changed file with 36 additions and 42 deletions
|
|
@@ -73,62 +73,56 @@ def scrape_nugs_catalog() -> dict:
|
||||||
Scrape the Nugs Goose catalog page to build a mapping of date -> URL.
|
Scrape the Nugs Goose catalog page to build a mapping of date -> URL.
|
||||||
Returns dict like {"2024-12-31": "https://www.nugs.net/..."}
|
Returns dict like {"2024-12-31": "https://www.nugs.net/..."}
|
||||||
"""
|
"""
|
||||||
catalog_url = "https://play.nugs.net/artist/461/latest"
|
catalog_url = "https://www.nugs.net/goose-concerts-live-downloads-in-mp3-flac-or-online-music-streaming/"
|
||||||
nugs_shows = {}
|
nugs_shows = {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# The Nugs catalog is a SPA, so we need to use their API
|
resp = requests.get(catalog_url, headers=HEADERS, timeout=30)
|
||||||
# API endpoint for artist shows
|
|
||||||
api_url = "https://streamapi.nugs.net/api.aspx"
|
|
||||||
params = {
|
|
||||||
"method": "catalog.container",
|
|
||||||
"containerType": "artist",
|
|
||||||
"containerId": "461", # Goose's artist ID
|
|
||||||
}
|
|
||||||
|
|
||||||
resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
|
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
print(f"Failed to fetch Nugs catalog: {resp.status_code}")
|
print(f"Failed to fetch Nugs catalog: {resp.status_code}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
# Parse response to extract shows
|
html = resp.text
|
||||||
data = resp.json()
|
|
||||||
if "Response" in data and "Shows" in data["Response"]:
|
# Extract URLs and dates using regex
|
||||||
for show in data["Response"]["Shows"]:
|
# URL pattern: https://www.nugs.net/live-download-of-goose-...-MM-DD-YYYY-mp3-flac-or-online-music-streaming/ID.html
|
||||||
perfDate = show.get("perfDate", "")[:10] # YYYY-MM-DD
|
url_pattern = r'(https://www\.nugs\.net/live-download-of-goose-[^"]+\.html)'
|
||||||
showId = show.get("showID")
|
urls = re.findall(url_pattern, html)
|
||||||
if perfDate and showId:
|
|
||||||
nugs_shows[perfDate] = f"https://www.nugs.net/live-download/goose-{perfDate.replace('-', '')}-{showId}.html"
|
# Date pattern in URL: MM-DD-YYYY (before mp3-flac...)
|
||||||
|
for url in set(urls): # deduplicate
|
||||||
|
# Try to extract date from URL
|
||||||
|
date_match = re.search(r'-(\d{1,2})-(\d{1,2})-(\d{4})-mp3-flac', url)
|
||||||
|
if date_match:
|
||||||
|
month, day, year = date_match.groups()
|
||||||
|
date_str = f"{year}-{int(month):02d}-{int(day):02d}"
|
||||||
|
nugs_shows[date_str] = url
|
||||||
|
|
||||||
|
print(f" Scraped {len(nugs_shows)} Nugs show URLs from catalog")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error scraping Nugs: {e}")
|
print(f"Error scraping Nugs: {e}")
|
||||||
|
|
||||||
return nugs_shows
|
return nugs_shows
|
||||||
|
|
||||||
def discover_nugs_url_fallback(show: Show, venue: Venue) -> str | None:
|
# Global cache for Nugs catalog
|
||||||
|
_nugs_catalog = None
|
||||||
|
|
||||||
|
def get_nugs_catalog():
|
||||||
|
"""Get or build Nugs catalog cache"""
|
||||||
|
global _nugs_catalog
|
||||||
|
if _nugs_catalog is None:
|
||||||
|
print("Building Nugs catalog from website...")
|
||||||
|
_nugs_catalog = scrape_nugs_catalog()
|
||||||
|
return _nugs_catalog
|
||||||
|
|
||||||
|
def discover_nugs_url(show: Show) -> str | None:
|
||||||
"""
|
"""
|
||||||
Fallback: Try constructing Nugs URL patterns if API doesn't work.
|
Look up Nugs URL from catalog by date.
|
||||||
"""
|
"""
|
||||||
if not venue:
|
|
||||||
return None
|
|
||||||
|
|
||||||
date_str = show.date.strftime("%Y-%m-%d")
|
date_str = show.date.strftime("%Y-%m-%d")
|
||||||
date_compact = show.date.strftime("%Y%m%d")
|
catalog = get_nugs_catalog()
|
||||||
|
return catalog.get(date_str)
|
||||||
# Try common Nugs URL patterns
|
|
||||||
city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
|
|
||||||
state = venue.state.lower() if venue.state else ""
|
|
||||||
|
|
||||||
patterns = [
|
|
||||||
f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html",
|
|
||||||
f"https://www.nugs.net/live-download/goose-{date_compact}.html",
|
|
||||||
]
|
|
||||||
|
|
||||||
for url in patterns:
|
|
||||||
if check_url_exists(url):
|
|
||||||
return url
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def main(dry_run: bool = True, limit: int = None):
|
def main(dry_run: bool = True, limit: int = None):
|
||||||
"""
|
"""
|
||||||
|
|
@@ -177,9 +171,9 @@ def main(dry_run: bool = True, limit: int = None):
|
||||||
else:
|
else:
|
||||||
already_have_bandcamp += 1
|
already_have_bandcamp += 1
|
||||||
|
|
||||||
# Nugs discovery (using fallback for now)
|
# Nugs discovery
|
||||||
if not show.nugs_link or generic_nugs:
|
if not show.nugs_link or generic_nugs:
|
||||||
nugs_url = discover_nugs_url_fallback(show, venue)
|
nugs_url = discover_nugs_url(show)
|
||||||
if nugs_url:
|
if nugs_url:
|
||||||
print(f"✓ Nugs: {date_str} -> {nugs_url}")
|
print(f"✓ Nugs: {date_str} -> {nugs_url}")
|
||||||
nugs_found += 1
|
nugs_found += 1
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue