feat: add audio link discovery script
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
This commit is contained in:
parent
1328cc458f
commit
58edc0e070
1 changed files with 219 additions and 0 deletions
219
backend/discover_audio_links.py
Normal file
219
backend/discover_audio_links.py
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
"""
|
||||
Show Audio Link Discovery Script
|
||||
|
||||
This script discovers show-specific Bandcamp and Nugs.net URLs by:
|
||||
1. For Bandcamp: Testing URL patterns based on date and venue
|
||||
2. For Nugs: Testing URL patterns based on date and venue location
   (a catalog API lookup is also implemented but is not yet used by main)
|
||||
|
||||
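Run from backend container: python discover_audio_links.py [--save] [--all]
By default this is a dry run over the 20 most recent shows; --save writes the
discovered links to the database and --all removes the 20-show limit.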
|
||||
"""
|
||||
import re
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from sqlmodel import Session, select
|
||||
from database import engine
|
||||
from models import Show, Venue
|
||||
from slugify import generate_slug
|
||||
|
||||
# Browser-like request headers passed to the HTTP calls made by this script.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}
|
||||
|
||||
def slugify_for_bandcamp(text: str) -> str:
    """Turn free-form text into a slug usable in a Bandcamp album URL.

    The text is lowercased; every character other than ``a-z``, ``0-9``,
    whitespace and ``-`` is dropped; whitespace runs become a single hyphen;
    repeated hyphens are collapsed; leading/trailing hyphens are trimmed.
    """
    # Applied strictly in order: each substitution feeds the next one.
    substitutions = (
        (r'[^a-z0-9\s-]', ''),
        (r'[\s_]+', '-'),
        (r'-+', '-'),
    )
    slug = text.lower()
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)
    return slug.strip('-')
|
||||
|
||||
def check_url_exists(url: str) -> bool:
    """Report whether a HEAD request to ``url`` ends in a 200 OK response.

    Redirects are followed, so the status inspected is that of the final
    response. The request is sent with the module-level ``HEADERS`` and a
    5 second timeout.

    Args:
        url: Absolute URL to probe.

    Returns:
        True if the final status code is 200; False for any other status or
        when the request itself fails (connection error, timeout, bad URL).
    """
    try:
        resp = requests.head(url, headers=HEADERS, timeout=5, allow_redirects=True)
    except requests.RequestException:
        # A failed request is treated as "does not exist". Only the requests
        # exception hierarchy is caught so that Ctrl-C (KeyboardInterrupt),
        # SystemExit and genuine programming errors are no longer swallowed.
        return False
    return resp.status_code == 200
|
||||
|
||||
def discover_bandcamp_url(show: Show, venue: Venue) -> str | None:
    """
    Probe Bandcamp for an album page matching this show.

    Candidate URLs follow the pattern
    https://goosetheband.bandcamp.com/album/YYYY-MM-DD-<slug>
    where <slug> is tried in this order: venue-city-state, city-state
    (some albums just use the location), venue-city.

    Returns the first candidate for which check_url_exists() is true, or
    None when there is no venue or no candidate matches.
    """
    if not venue:
        return None

    show_date = show.date.strftime("%Y-%m-%d")

    # Every variation needs a city; the state only adds the first two.
    candidate_slugs = []
    if venue.city:
        name_part = slugify_for_bandcamp(venue.name)
        city_part = slugify_for_bandcamp(venue.city)
        if venue.state:
            state_part = venue.state.lower()
            candidate_slugs.append(f"{name_part}-{city_part}-{state_part}")
            candidate_slugs.append(f"{city_part}-{state_part}")
        candidate_slugs.append(f"{name_part}-{city_part}")

    candidate_urls = (
        f"https://goosetheband.bandcamp.com/album/{show_date}-{slug}"
        for slug in candidate_slugs
    )
    # Lazily probe candidates and stop at the first one that exists.
    return next((url for url in candidate_urls if check_url_exists(url)), None)
|
||||
|
||||
def scrape_nugs_catalog() -> dict:
    """
    Query the Nugs streaming API for the Goose catalog and map dates to URLs.

    The Nugs catalog page is a SPA, so the ``catalog.container`` API method
    is called instead of scraping HTML.

    Returns:
        dict like {"2024-12-31": "https://www.nugs.net/..."}, keyed by the
        first 10 characters of each entry's ``perfDate``. An empty dict is
        returned when the request fails, the status is not 200, or the body
        is not valid JSON. Entries lacking a usable ``perfDate`` or
        ``showID`` are skipped individually rather than aborting the parse.

    NOTE(review): assumes the payload looks like
    ``{"Response": {"Shows": [{"perfDate": "YYYY-MM-DD...", "showID": ...}]}}``
    -- confirm against a live response.
    """
    api_url = "https://streamapi.nugs.net/api.aspx"
    params = {
        "method": "catalog.container",
        "containerType": "artist",
        "containerId": "461",  # Goose's artist ID
    }

    try:
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
        if resp.status_code != 200:
            print(f"Failed to fetch Nugs catalog: {resp.status_code}")
            return {}
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures across requests versions.
        print(f"Error scraping Nugs: {e}")
        return {}

    # Walk the expected nesting defensively; any unexpected shape yields {}.
    response = data.get("Response") if isinstance(data, dict) else None
    entries = response.get("Shows") if isinstance(response, dict) else None
    if not isinstance(entries, list):
        return {}

    nugs_shows = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        raw_date = entry.get("perfDate")
        # A null/non-string date used to raise and abort the whole loop.
        perf_date = raw_date[:10] if isinstance(raw_date, str) else ""  # YYYY-MM-DD
        show_id = entry.get("showID")
        if perf_date and show_id:
            nugs_shows[perf_date] = (
                f"https://www.nugs.net/live-download/goose-{perf_date.replace('-', '')}-{show_id}.html"
            )

    return nugs_shows
|
||||
|
||||
def discover_nugs_url_fallback(show: Show, venue: Venue) -> str | None:
    """
    Fallback: Try constructing Nugs URL patterns if API doesn't work.

    Candidates, in order:
      1. goose-YYYYMMDD-<city>-<state>.html (only when both the city slug and
         the state are non-empty)
      2. goose-YYYYMMDD.html

    Args:
        show: Show whose ``date`` is formatted into the URL.
        venue: Venue supplying ``city`` and ``state``; if falsy, nothing is
            probed.

    Returns:
        The first candidate URL for which check_url_exists() is true, or None.
    """
    if not venue:
        return None

    date_compact = show.date.strftime("%Y%m%d")

    city_slug = slugify_for_bandcamp(venue.city) if venue.city else ""
    state = venue.state.lower() if venue.state else ""

    patterns = []
    # Without both parts the location URL would degenerate into something
    # like "goose-20240101--.html", so that probe is skipped entirely.
    if city_slug and state:
        patterns.append(
            f"https://www.nugs.net/live-download/goose-{date_compact}-{city_slug}-{state}.html"
        )
    patterns.append(f"https://www.nugs.net/live-download/goose-{date_compact}.html")

    for url in patterns:
        if check_url_exists(url):
            return url

    return None
|
||||
|
||||
def main(dry_run: bool = True, limit: int = None):
    """
    Discover show-specific Bandcamp and Nugs links and optionally store them.

    Shows are loaded newest first. For each show, a link is looked up when
    the current one is empty or only a generic band-level link; otherwise
    the show is counted as already having that link. A summary is printed
    at the end.

    Args:
        dry_run: If True, print changes but don't write to DB
        limit: Max number of shows to process (for testing); a falsy value
            means no limit
    """
    banner = "=" * 60
    print(banner)
    print("Show Audio Link Discovery")
    print(banner)

    with Session(engine) as session:
        statement = select(Show).order_by(Show.date.desc())
        if limit:
            statement = statement.limit(limit)

        shows = session.exec(statement).all()
        print(f"Found {len(shows)} shows to process")

        # Running tallies for the summary printed at the end.
        counts = {
            "bandcamp_found": 0,
            "nugs_found": 0,
            "already_have_bandcamp": 0,
            "already_have_nugs": 0,
        }

        for show in shows:
            venue = None
            if show.venue_id:
                venue = session.get(Venue, show.venue_id)
            date_str = show.date.strftime("%Y-%m-%d")

            # A band-level Bandcamp URL (no /album/ path) or a Nugs URL that
            # carries the generic utm_source tag does not count as specific.
            current_bandcamp = show.bandcamp_link or ""
            current_nugs = show.nugs_link or ""
            generic_bandcamp = (
                "goosetheband.bandcamp.com" in current_bandcamp
                and "/album/" not in current_bandcamp
            )
            generic_nugs = "utm_source=goose" in current_nugs

            # Bandcamp
            if show.bandcamp_link and not generic_bandcamp:
                counts["already_have_bandcamp"] += 1
            else:
                bc_url = discover_bandcamp_url(show, venue)
                if bc_url:
                    print(f"✓ Bandcamp: {date_str} -> {bc_url}")
                    counts["bandcamp_found"] += 1
                    if not dry_run:
                        show.bandcamp_link = bc_url
                        session.add(show)

            # Nugs (URL-pattern fallback only, for now)
            if show.nugs_link and not generic_nugs:
                counts["already_have_nugs"] += 1
            else:
                nugs_url = discover_nugs_url_fallback(show, venue)
                if nugs_url:
                    print(f"✓ Nugs: {date_str} -> {nugs_url}")
                    counts["nugs_found"] += 1
                    if not dry_run:
                        show.nugs_link = nugs_url
                        session.add(show)

        # All pending updates are written in a single commit.
        if not dry_run:
            session.commit()

        print("\n" + banner)
        print("Summary")
        print(banner)
        print(f"Shows processed: {len(shows)}")
        print(f"Bandcamp links found: {counts['bandcamp_found']}")
        print(f"Nugs links found: {counts['nugs_found']}")
        print(f"Already had Bandcamp: {counts['already_have_bandcamp']}")
        print(f"Already had Nugs: {counts['already_have_nugs']}")

        if dry_run:
            print("\n[DRY RUN - No changes saved. Run with dry_run=False to save]")
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Flags are only honoured when at least one command-line argument is
    # present; otherwise the defaults apply (dry run, 20 shows).
    flags = sys.argv if len(sys.argv) > 1 else ()

    dry_run = "--save" not in flags           # --save: write links to the DB
    limit = None if "--all" in flags else 20  # --all: lift the 20-show test cap

    main(dry_run=dry_run, limit=limit)
|
||||
Loading…
Add table
Reference in a new issue