"""Populate Bandcamp and Nugs links on stored shows by scraping El Goose.

The script fetches show records from the El Goose API to learn each show's
permalink, matches them to database ``Show`` rows by date, scrapes the
corresponding show page for Bandcamp / Nugs links, and saves any new links.
"""

import re
import time

import requests
from bs4 import BeautifulSoup
from sqlmodel import Session, select

from database import engine
from models import Show

# El Goose API
API_BASE = "https://elgoose.net/api/v2"
SITE_BASE = "https://elgoose.net"

# Seconds to wait for any single HTTP response before giving up.
_REQUEST_TIMEOUT = 10
# Hard upper bound on API pages requested, so a misbehaving API cannot
# keep the fetch loop running forever.
_MAX_PAGES = 50
# Pauses (seconds) between shows: longer after a row was changed, short
# otherwise.
_PAUSE_AFTER_UPDATE = 1
_PAUSE_NO_UPDATE = 0.1


def get_shows_from_api():
    """Fetch show records (including permalinks) from the El Goose API.

    Requests ``shows.json`` for artist id 1, ordered by show date descending,
    one page at a time. Fetching stops at the first empty or malformed page,
    at the first request/JSON error, when a page repeats the previous one, or
    after ``_MAX_PAGES`` pages.

    Returns:
        list: The accumulated entries of each response's ``data`` field.
        May be partial (or empty) if an error interrupted the fetch.
    """
    print("Fetching all shows from API...")
    url = f"{API_BASE}/shows.json"
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}

    all_shows = []
    previous_chunk = None
    for page in range(1, _MAX_PAGES + 1):
        params["page"] = page
        print(f" Fetching page {page}...", end="", flush=True)
        try:
            resp = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f" Error: {e}")
            break

        chunk = data.get("data") if isinstance(data, dict) else None
        # NOTE(review): the repeat check guards against the endpoint ignoring
        # `page` and returning the same payload every time -- confirm whether
        # this API actually paginates.
        if not chunk or chunk == previous_chunk:
            print(" Done.")
            break

        all_shows.extend(chunk)
        print(f" Got {len(chunk)} shows.")
        previous_chunk = chunk

    return all_shows


def _pick_links(hrefs):
    """Choose the Bandcamp and Nugs links from an iterable of href strings.

    A Bandcamp link is any href containing ``goosetheband.bandcamp.com`` or
    ``bandcamp.com/album``. A Nugs link is any href containing ``nugs.net``
    together with ``/goose-`` or ``recording``. When several hrefs qualify,
    the last one seen wins.

    Returns:
        tuple: ``(bandcamp, nugs)``; each is an href string or ``None``.
    """
    bandcamp = None
    nugs = None
    for href in hrefs:
        if "goosetheband.bandcamp.com" in href or "bandcamp.com/album" in href:
            bandcamp = href
        if "nugs.net" in href and ("/goose-" in href or "recording" in href):
            nugs = href
    return bandcamp, nugs


def scrape_links(permalink):
    """Scrape Bandcamp and Nugs links from an El Goose show page.

    The page is requested at ``{SITE_BASE}/setlists/{permalink}``; on a 404
    the same permalink is retried directly under the site root.

    Args:
        permalink: Path fragment identifying the show page. A falsy value
            short-circuits without any request.

    Returns:
        tuple: ``(bandcamp, nugs)``; each is an href string or ``None``.
        ``(None, None)`` is returned when the permalink is empty, the page
        does not answer with HTTP 200, or any error occurs while fetching
        or parsing.
    """
    if not permalink:
        return None, None

    url = f"{SITE_BASE}/setlists/{permalink}"
    try:
        resp = requests.get(url, timeout=_REQUEST_TIMEOUT)
        if resp.status_code == 404:
            # Some pages live at the site root rather than under setlists/.
            url = f"{SITE_BASE}/{permalink}"
            resp = requests.get(url, timeout=_REQUEST_TIMEOUT)

        if resp.status_code != 200:
            print(f" ❌ Failed to fetch {url}: {resp.status_code}")
            return None, None

        soup = BeautifulSoup(resp.text, "html.parser")
        return _pick_links(a["href"] for a in soup.find_all("a", href=True))
    except Exception as e:
        # Deliberately broad: one unreachable or unparsable page must not
        # abort the whole batch run; the failure is reported and skipped.
        print(f" ⚠️ Scraping error: {e}")
        return None, None


def main():
    """Fill in missing Bandcamp / Nugs links for every show in the database.

    Builds a show-date -> permalink map from the API, then, for each ``Show``
    row that has a permalink and is missing at least one link, scrapes the
    show page and stores any link that is new. Each changed row is committed
    on its own so progress survives an interrupted run; a failed commit is
    rolled back and reported without stopping the loop.
    """
    print("🔗 Starting Link Population Script...")

    # 1. Fetch API data to get permalinks (they are not stored in the DB).
    api_shows = get_shows_from_api()
    print(f"✓ Found {len(api_shows)} shows in API.")

    # Lookup map: "YYYY-MM-DD" show date -> permalink. Records lacking
    # either field are skipped so they can neither raise nor shadow a
    # usable record for the same date.
    date_to_permalink = {
        s["showdate"]: s["permalink"]
        for s in api_shows
        if s.get("showdate") and s.get("permalink")
    }

    with Session(engine) as session:
        # 2. Load the shows stored in the database.
        db_shows = session.exec(select(Show)).all()
        print(f"✓ Found {len(db_shows)} shows in DB to check.")

        updates = 0

        for show in db_shows:
            s_date = show.date.strftime("%Y-%m-%d")
            permalink = date_to_permalink.get(s_date)
            if not permalink:
                continue

            # Nothing to gain from scraping when both links are present.
            if show.bandcamp_link and show.nugs_link:
                continue

            print(f"Processing {s_date}...", end="", flush=True)

            bc_link, nugs_link = scrape_links(permalink)

            updated = False
            if bc_link and bc_link != show.bandcamp_link:
                show.bandcamp_link = bc_link
                updated = True
                print(" [BC]", end="")

            if nugs_link and nugs_link != show.nugs_link:
                show.nugs_link = nugs_link
                updated = True
                print(" [Nugs]", end="")

            if updated:
                session.add(show)
                try:
                    # Commit per show so completed work is never lost.
                    session.commit()
                except Exception as e:
                    # Without a rollback the session stays in a failed
                    # state and every later commit would fail as well.
                    session.rollback()
                    print(f" ❌ Save error: {e}")
                else:
                    updates += 1
                    print(" ✓")
            else:
                print(" (No new links)")

            # Be nice to the server between page fetches.
            time.sleep(_PAUSE_AFTER_UPDATE if updated else _PAUSE_NO_UPDATE)

    print(f"\n🎉 Done! Updated {updates} shows.")


if __name__ == "__main__":
    main()