import requests
from bs4 import BeautifulSoup
from sqlmodel import Session, select
from database import engine
from models import Show
import time
import re

# El Goose API
API_BASE = "https://elgoose.net/api/v2"
SITE_BASE = "https://elgoose.net"


def get_shows_from_api():
    """Fetch all shows with their permalinks from the API"""
    print("Fetching all shows from API...")
    url = f"{API_BASE}/shows.json"
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}  # Get all, newest first

    all_shows = []
    page = 1
    while True:
        params['page'] = page
        print(f" Fetching page {page}...", end="", flush=True)
        try:
            resp = requests.get(url, params=params)
            resp.raise_for_status()
            data = resp.json()

            if not data or 'data' not in data:
                print(" Done.")
                break

            chunk = data['data']
            if not chunk:
                print(" Done.")
                break

            all_shows.extend(chunk)
            print(f" Got {len(chunk)} shows.")
            page += 1

            if page > 50:  # Safety
                break
        except Exception as e:
            print(f" Error: {e}")
            break

    return all_shows


def scrape_links(permalink):
    """Scrape Bandcamp and Nugs links from an El Goose show page"""
    if not permalink:
        return None, None

    url = f"{SITE_BASE}/setlists/{permalink}"
    # Sometimes it might be at root? Try setlists/ first as per observation.

    try:
        # print(f" Scraping {url}...")
        resp = requests.get(url, timeout=10)
        if resp.status_code == 404:
            # Try root
            url = f"{SITE_BASE}/{permalink}"
            resp = requests.get(url, timeout=10)

        if resp.status_code != 200:
            print(f" āŒ Failed to fetch {url}: {resp.status_code}")
            return None, None

        soup = BeautifulSoup(resp.text, 'html.parser')

        bandcamp = None
        nugs = None

        # Look for links.
        # Usually they are in some container or just raw tags
        # Pattern matching for hrefs
        for a in soup.find_all('a', href=True):
            href = a['href']

            # Bandcamp
            if 'bandcamp.com' in href:
                if 'goosetheband.bandcamp.com' in href or 'bandcamp.com/album' in href:
                    bandcamp = href

            # Nugs
            if 'nugs.net' in href:
                if '/goose-' in href or 'recording' in href:
                    nugs = href

        return bandcamp, nugs
    except Exception as e:
        print(f" āš ļø Scraping error: {e}")
        return None, None


def main():
    print("šŸ”— Starting Link Population Script...")

    # 1. Fetch API data to get permalinks (since we didn't store them)
    api_shows = get_shows_from_api()
    print(f"āœ“ Found {len(api_shows)} shows in API.")

    # Create lookup map: Date -> Permalink
    # Note: API date is "YYYY-MM-DD"
    date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}

    with Session(engine) as session:
        # 2. Get our DB shows
        db_shows = session.exec(select(Show)).all()
        print(f"āœ“ Found {len(db_shows)} shows in DB to check.")

        updates = 0

        for show in db_shows:
            s_date = show.date.strftime("%Y-%m-%d")
            permalink = date_to_permalink.get(s_date)

            if not permalink:
                # print(f" āš ļø No permalink found for {s_date}")
                continue

            # Skip if we already have both links (optional, but good for speed)
            if show.bandcamp_link and show.nugs_link:
                continue

            print(f"Processing {s_date}...", end="", flush=True)

            bc_link, nugs_link = scrape_links(permalink)

            updated = False
            if bc_link and bc_link != show.bandcamp_link:
                show.bandcamp_link = bc_link
                updated = True
                print(" [BC]", end="")

            if nugs_link and nugs_link != show.nugs_link:
                show.nugs_link = nugs_link
                updated = True
                print(" [Nugs]", end="")

            if updated:
                session.add(show)
                updates += 1
                try:
                    session.commit()  # Commit frequently to save progress
                    session.refresh(show)
                    print(" āœ“")
                except Exception as e:
                    print(f" āŒ Save error: {e}")
            else:
                print(" (No new links)")

            # Be nice to the server
            if updated:
                time.sleep(1)  # Sleep only if we did work
            else:
                time.sleep(0.1)

    print(f"\nšŸŽ‰ Done! Updated {updates} shows.")


if __name__ == "__main__":
    main()
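
# For reference, a minimal sketch of the Show model this script assumes.
# This is inferred from the attributes used above (date, bandcamp_link,
# nugs_link); the real definition lives in models.py and may differ or
# contain additional fields.
#
#     from datetime import date as date_type
#     from typing import Optional
#     from sqlmodel import SQLModel, Field
#
#     class Show(SQLModel, table=True):
#         id: Optional[int] = Field(default=None, primary_key=True)
#         date: date_type                      # used via show.date.strftime(...)
#         bandcamp_link: Optional[str] = None  # filled in by scrape_links()
#         nugs_link: Optional[str] = None      # filled in by scrape_links()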