fix(backend): Optimize populate_links script

fullsizemalt 2025-12-21 01:37:32 -08:00
parent eb83a3b65f
commit 15e99b506a


@@ -15,55 +15,42 @@ def get_shows_from_api():
     """Fetch all shows with their permalinks from the API"""
     print("Fetching all shows from API...")
     url = f"{API_BASE}/shows.json"
-    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"} # Get all, newest first
+    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}
     all_shows = []
-    page = 1
-    while True:
-        params['page'] = page
-        print(f" Fetching page {page}...", end="", flush=True)
-        try:
-            resp = requests.get(url, params=params)
-            resp.raise_for_status()
-            data = resp.json()
-            if not data or 'data' not in data:
-                print(" Done.")
-                break
-            chunk = data['data']
-            if not chunk:
-                print(" Done.")
-                break
-            all_shows.extend(chunk)
-            print(f" Got {len(chunk)} shows.")
-            page += 1
-            if page > 50: # Safety
-                break
-        except Exception as e:
-            print(f" Error: {e}")
-            break
+    # It seems the API might return ALL shows on page 1 if no limit is set.
+    # We will try fetching page 1.
+    try:
+        print(f" Fetching shows...", end="", flush=True)
+        resp = requests.get(url, params=params)
+        resp.raise_for_status()
+        data = resp.json()
+        if data and 'data' in data:
+            all_shows = data['data']
+            print(f" Got {len(all_shows)} shows.")
+            return all_shows
+    except Exception as e:
+        print(f" Error: {e}")
     return all_shows
 
-def scrape_links(permalink):
+def scrape_links(session_http, permalink):
     """Scrape Bandcamp and Nugs links from an El Goose show page"""
     if not permalink:
         return None, None
     url = f"{SITE_BASE}/setlists/{permalink}"
-    # Sometimes it might be at root? Try setlists/ first as per observation.
     try:
-        # print(f" Scraping {url}...")
-        resp = requests.get(url, timeout=10)
+        resp = session_http.get(url, timeout=10)
         if resp.status_code == 404:
-            # Try root
             url = f"{SITE_BASE}/{permalink}"
-            resp = requests.get(url, timeout=10)
+            resp = session_http.get(url, timeout=10)
         if resp.status_code != 200:
-            print(f" ❌ Failed to fetch {url}: {resp.status_code}")
             return None, None
         soup = BeautifulSoup(resp.text, 'html.parser')
@@ -71,10 +58,6 @@ def scrape_links(permalink):
     bandcamp = None
     nugs = None
 
-    # Look for links.
-    # Usually they are in some container or just raw <a> tags
-    # Pattern matching for hrefs
     for a in soup.find_all('a', href=True):
         href = a['href']
@@ -97,36 +80,48 @@ def scrape_links(permalink):
 def main():
     print("🔗 Starting Link Population Script...")
-    # 1. Fetch API data to get permalinks (since we didn't store them)
+    # 1. Fetch API data
     api_shows = get_shows_from_api()
     print(f"✓ Found {len(api_shows)} shows in API.")
-    # Create lookup map: Date -> Permalink
-    # Note: API date is "YYYY-MM-DD"
+    if not api_shows:
+        print("❌ No shows found in API. Exiting.")
+        return
     date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}
 
+    # Setup HTTP Session
+    http = requests.Session()
+    http.headers.update({
+        "User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"
+    })
+
     with Session(engine) as session:
         # 2. Get our DB shows
         db_shows = session.exec(select(Show)).all()
+        # Sort by date desc to update newest first
+        db_shows.sort(key=lambda x: x.date, reverse=True)
         print(f"✓ Found {len(db_shows)} shows in DB to check.")
 
         updates = 0
+        checked = 0
         for show in db_shows:
+            checked += 1
             s_date = show.date.strftime("%Y-%m-%d")
             permalink = date_to_permalink.get(s_date)
             if not permalink:
-                # print(f" ⚠️ No permalink found for {s_date}")
                 continue
 
-            # Skip if we already have both links (optional, but good for speed)
+            # Skip if we already have both
            if show.bandcamp_link and show.nugs_link:
                 continue
 
-            print(f"Processing {s_date}...", end="", flush=True)
-            bc_link, nugs_link = scrape_links(permalink)
+            print(f"[{checked}/{len(db_shows)}] {s_date}...", end="", flush=True)
+            bc_link, nugs_link = scrape_links(http, permalink)
 
             updated = False
             if bc_link and bc_link != show.bandcamp_link:
@@ -143,18 +138,15 @@ def main():
                 session.add(show)
                 updates += 1
                 try:
-                    session.commit() # Commit frequently to save progress
+                    session.commit()
                     session.refresh(show)
                     print("")
                 except Exception as e:
                     print(f" ❌ Save error: {e}")
             else:
-                print(" (No new links)")
+                print(" -")
 
-            # Be nice to the server
-            if updated:
-                time.sleep(1) # Sleep only if we did work
-            else:
-                time.sleep(0.1)
+            # Small delay
+            time.sleep(0.1)
 
     print(f"\n🎉 Done! Updated {updates} shows.")