"""Populate Bandcamp and Nugs streaming links on Show rows.

Fetches the full show list (date -> permalink) from the El Goose API, then for
each show in the local DB that is missing a link, scrapes its elgoose.net
setlist page for Bandcamp and Nugs URLs and persists them.
"""

import re
import time

import requests
from bs4 import BeautifulSoup
from sqlmodel import Session, select

from database import engine
from models import Show

# El Goose API
API_BASE = "https://elgoose.net/api/v2"
SITE_BASE = "https://elgoose.net"


def get_shows_from_api():
    """Fetch all shows with their permalinks from the API.

    Returns a list of show dicts (empty on error); each dict is expected to
    contain at least 'showdate' and 'permalink'.
    """
    print("Fetching all shows from API...")
    url = f"{API_BASE}/shows.json"
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}
    all_shows = []
    # It seems the API might return ALL shows on page 1 if no limit is set.
    # We will try fetching page 1.
    try:
        print("  Fetching shows...", end="", flush=True)
        # Timeout added so a stalled connection cannot hang the script forever
        # (the scraper below already uses timeouts; this makes us consistent).
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        if data and 'data' in data:
            all_shows = data['data']
        print(f" Got {len(all_shows)} shows.")
        return all_shows
    except Exception as e:
        # Best-effort: report and return whatever we managed to collect.
        print(f" Error: {e}")
    return all_shows


def scrape_links(session_http, permalink):
    """Scrape Bandcamp and Nugs links from an El Goose show page.

    Returns a (bandcamp_url, nugs_url) tuple; either element may be None.
    """
    if not permalink:
        return None, None
    url = f"{SITE_BASE}/setlists/{permalink}"
    try:
        resp = session_http.get(url, timeout=10)
        # Some permalinks live at the site root rather than under /setlists/.
        if resp.status_code == 404:
            url = f"{SITE_BASE}/{permalink}"
            resp = session_http.get(url, timeout=10)
        if resp.status_code != 200:
            return None, None

        soup = BeautifulSoup(resp.text, 'html.parser')
        bandcamp = None
        nugs = None
        # Last matching anchor wins for each service, mirroring the page order.
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Bandcamp
            if 'bandcamp.com' in href:
                if 'goosetheband.bandcamp.com' in href or 'bandcamp.com/album' in href:
                    bandcamp = href
            # Nugs
            if 'nugs.net' in href:
                if '/goose-' in href or 'recording' in href:
                    nugs = href
        return bandcamp, nugs
    except Exception as e:
        print(f"  āš ļø Scraping error: {e}")
        return None, None


def main():
    """Cross-reference API permalinks with DB shows and fill in missing links."""
    print("šŸ”— Starting Link Population Script...")

    # 1. Fetch API data
    api_shows = get_shows_from_api()
    print(f"āœ“ Found {len(api_shows)} shows in API.")
    if not api_shows:
        # NOTE: the original source had this literal broken across a raw
        # newline (a syntax error); reconstructed as a single string.
        print("āŒ No shows found in API. Exiting.")
        return

    date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}

    # Setup HTTP Session
    http = requests.Session()
    http.headers.update({
        "User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"
    })

    with Session(engine) as session:
        # 2. Get our DB shows
        db_shows = session.exec(select(Show)).all()
        # Sort by date desc to update newest first
        db_shows.sort(key=lambda x: x.date, reverse=True)
        print(f"āœ“ Found {len(db_shows)} shows in DB to check.")

        updates = 0
        checked = 0
        for show in db_shows:
            checked += 1
            s_date = show.date.strftime("%Y-%m-%d")
            permalink = date_to_permalink.get(s_date)
            if not permalink:
                continue

            # Skip if we already have both
            if show.bandcamp_link and show.nugs_link:
                continue

            print(f"[{checked}/{len(db_shows)}] {s_date}...", end="", flush=True)
            bc_link, nugs_link = scrape_links(http, permalink)

            updated = False
            if bc_link and bc_link != show.bandcamp_link:
                show.bandcamp_link = bc_link
                updated = True
                print(" [BC]", end="")
            if nugs_link and nugs_link != show.nugs_link:
                show.nugs_link = nugs_link
                updated = True
                print(" [Nugs]", end="")

            if updated:
                session.add(show)
                updates += 1
                try:
                    session.commit()
                    session.refresh(show)
                    print(" āœ“")
                except Exception as e:
                    print(f" āŒ Save error: {e}")
            else:
                print(" -")

            # Small delay to be polite to the server.
            time.sleep(0.1)

    print(f"\nšŸŽ‰ Done! Updated {updates} shows.")


if __name__ == "__main__":
    main()