diff --git a/backend/populate_links.py b/backend/populate_links.py
index 1342d11..b87d8cd 100644
--- a/backend/populate_links.py
+++ b/backend/populate_links.py
@@ -15,55 +15,42 @@ def get_shows_from_api():
     """Fetch all shows with their permalinks from the API"""
     print("Fetching all shows from API...")
     url = f"{API_BASE}/shows.json"
-    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}  # Get all, newest first
+    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}
 
     all_shows = []
-    page = 1
-    while True:
-        params['page'] = page
-        print(f"  Fetching page {page}...", end="", flush=True)
-        try:
-            resp = requests.get(url, params=params)
-            resp.raise_for_status()
-            data = resp.json()
-            if not data or 'data' not in data:
-                print(" Done.")
-                break
+
+    # It seems the API might return ALL shows on page 1 if no limit is set.
+    # We will try fetching page 1.
+    try:
+        print(f"  Fetching shows...", end="", flush=True)
+        resp = requests.get(url, params=params)
+        resp.raise_for_status()
+        data = resp.json()
+
+        if data and 'data' in data:
+            all_shows = data['data']
+            print(f" Got {len(all_shows)} shows.")
+            return all_shows
 
-            chunk = data['data']
-            if not chunk:
-                print(" Done.")
-                break
-
-            all_shows.extend(chunk)
-            print(f" Got {len(chunk)} shows.")
-            page += 1
-            if page > 50:  # Safety
-                break
-        except Exception as e:
-            print(f" Error: {e}")
-            break
+    except Exception as e:
+        print(f" Error: {e}")
 
     return all_shows
 
 
-def scrape_links(permalink):
+def scrape_links(session_http, permalink):
     """Scrape Bandcamp and Nugs links from an El Goose show page"""
     if not permalink:
         return None, None
 
     url = f"{SITE_BASE}/setlists/{permalink}"
-    # Sometimes it might be at root? Try setlists/ first as per observation.
 
     try:
-        # print(f"  Scraping {url}...")
-        resp = requests.get(url, timeout=10)
+        resp = session_http.get(url, timeout=10)
         if resp.status_code == 404:
-            # Try root
             url = f"{SITE_BASE}/{permalink}"
-            resp = requests.get(url, timeout=10)
+            resp = session_http.get(url, timeout=10)
 
         if resp.status_code != 200:
-            print(f"  āŒ Failed to fetch {url}: {resp.status_code}")
             return None, None
 
         soup = BeautifulSoup(resp.text, 'html.parser')
@@ -71,10 +58,6 @@ def scrape_links(permalink):
         bandcamp = None
         nugs = None
 
-        # Look for links.
-        # Usually they are in some container or just raw tags
-        # Pattern matching for hrefs
-
         for a in soup.find_all('a', href=True):
             href = a['href']
 
@@ -97,36 +80,48 @@ def main():
     print("šŸ”— Starting Link Population Script...")
 
-    # 1. Fetch API data to get permalinks (since we didn't store them)
+    # 1. Fetch API data
     api_shows = get_shows_from_api()
     print(f"āœ“ Found {len(api_shows)} shows in API.")
 
-    # Create lookup map: Date -> Permalink
-    # Note: API date is "YYYY-MM-DD"
+    if not api_shows:
+        print("āŒ No shows found in API. Exiting.")
+        return
+
     date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}
 
+    # Setup HTTP Session
+    http = requests.Session()
+    http.headers.update({
+        "User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"
+    })
+
     with Session(engine) as session:
         # 2. Get our DB shows
         db_shows = session.exec(select(Show)).all()
+        # Sort by date desc to update newest first
+        db_shows.sort(key=lambda x: x.date, reverse=True)
+
         print(f"āœ“ Found {len(db_shows)} shows in DB to check.")
 
         updates = 0
+        checked = 0
         for show in db_shows:
+            checked += 1
             s_date = show.date.strftime("%Y-%m-%d")
             permalink = date_to_permalink.get(s_date)
 
             if not permalink:
-                # print(f"  āš ļø No permalink found for {s_date}")
                 continue
 
-            # Skip if we already have both links (optional, but good for speed)
+            # Skip if we already have both
             if show.bandcamp_link and show.nugs_link:
-                continue
+                continue
 
-            print(f"Processing {s_date}...", end="", flush=True)
+            print(f"[{checked}/{len(db_shows)}] {s_date}...", end="", flush=True)
 
-            bc_link, nugs_link = scrape_links(permalink)
+            bc_link, nugs_link = scrape_links(http, permalink)
 
             updated = False
             if bc_link and bc_link != show.bandcamp_link:
@@ -143,19 +138,16 @@ def main():
                 session.add(show)
                 updates += 1
                 try:
-                    session.commit()  # Commit frequently to save progress
+                    session.commit()
                     session.refresh(show)
                     print(" āœ“")
                 except Exception as e:
                     print(f"  āŒ Save error: {e}")
             else:
-                print(" (No new links)")
+                print(" -")
 
-            # Be nice to the server
-            if updated:
-                time.sleep(1)  # Sleep only if we did work
-            else:
-                time.sleep(0.1)
+            # Small delay
+            time.sleep(0.1)
 
     print(f"\nšŸŽ‰ Done! Updated {updates} shows.")
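
Note on the single-request change in `get_shows_from_api`: the patch's own comment only hedges that the API "might" return all shows on page 1. If that assumption ever stops holding, a paginated fallback can be reintroduced without giving up the fast path. Below is a minimal sketch, not part of the patch: `get_shows_paginated` is a hypothetical helper name, the `API_BASE` value is a placeholder for the module's real constant, and the `page` parameter plus the 50-page safety cap are carried over from the loop this patch removes.

```python
import requests

# Assumption: placeholder for the module-level API_BASE constant in populate_links.py.
API_BASE = "https://elgoose.net/api/v2"


def get_shows_paginated(http):
    """Fetch shows page by page; stays cheap if page 1 already holds everything."""
    url = f"{API_BASE}/shows.json"
    base_params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}

    all_shows = []
    last_chunk = None
    page = 1
    while page <= 50:  # safety cap carried over from the removed loop
        try:
            resp = http.get(url, params={**base_params, "page": page}, timeout=15)
            resp.raise_for_status()
            chunk = (resp.json() or {}).get("data") or []
        except Exception as e:
            print(f"  Error on page {page}: {e}")
            break

        if not chunk:
            break  # an empty page means the previous page was the last
        if chunk == last_chunk:
            break  # the API ignored the page param; stop before duplicating rows

        all_shows.extend(chunk)
        last_chunk = chunk

        # If page 1 really does return everything (the patch's assumption), page 2
        # comes back empty or repeats page 1, so the loop ends after one extra call.
        page += 1

    return all_shows


if __name__ == "__main__":
    http = requests.Session()
    http.headers.update({"User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"})
    shows = get_shows_paginated(http)
    print(f"Fetched {len(shows)} shows")
```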