# elmeg-demo/backend/populate_links.py


import requests
from bs4 import BeautifulSoup
from sqlmodel import Session, select
from database import engine
from models import Show
import time
import re

# El Goose endpoints: API_BASE is the root for JSON API requests (shows.json),
# SITE_BASE is the root used to build show-page URLs for scraping.
API_BASE = "https://elgoose.net/api/v2"
SITE_BASE = "https://elgoose.net"

def get_shows_from_api(max_pages=50, timeout=10):
    """Fetch all shows with their permalinks from the API.

    Requests ``{API_BASE}/shows.json`` one page at a time, starting at
    page 1, and accumulates each response's ``data`` list. Fetching stops
    when a response has no ``data`` key or an empty ``data`` list, when a
    request fails, or when ``max_pages`` pages have been fetched.

    Args:
        max_pages: Safety cap on the number of pages requested.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        list: The show records collected. This is best effort: on any
        error the records gathered before the failure are still returned.
    """
    print("Fetching all shows from API...")
    url = f"{API_BASE}/shows.json"
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}  # Get all, newest first

    all_shows = []
    page = 1
    while True:
        params['page'] = page
        print(f"  Fetching page {page}...", end="", flush=True)
        try:
            # Always bound the wait: without a timeout a stalled connection
            # would hang the whole script indefinitely.
            resp = requests.get(url, params=params, timeout=timeout)
            resp.raise_for_status()
            data = resp.json()
            if not data or 'data' not in data:
                print(" Done.")
                break

            chunk = data['data']
            if not chunk:
                print(" Done.")
                break

            all_shows.extend(chunk)
            print(f" Got {len(chunk)} shows.")
            page += 1
            if page > max_pages:  # Safety
                # Make the truncation visible instead of stopping silently.
                print(f"  Reached page limit ({max_pages}); stopping.")
                break
        except Exception as e:
            # Deliberate best effort: report and return what we have so far.
            print(f" Error: {e}")
            break

    return all_shows

def scrape_links(permalink):
    """Scrape the Bandcamp and Nugs links from an El Goose show page.

    The page is requested at ``{SITE_BASE}/setlists/{permalink}``; if that
    answers 404, ``{SITE_BASE}/{permalink}`` is tried instead. Every
    ``<a href>`` on the page is inspected, and the last anchor matching
    each service's pattern wins.

    Returns:
        tuple: ``(bandcamp, nugs)``; each element is the matched href or
        ``None``. ``(None, None)`` is returned for a falsy permalink, a
        non-200 response, or any exception raised while fetching/parsing.
    """
    if not permalink:
        return None, None

    page_url = f"{SITE_BASE}/setlists/{permalink}"

    try:
        response = requests.get(page_url, timeout=10)
        if response.status_code == 404:
            # The setlists/ path did not exist; retry at the site root.
            page_url = f"{SITE_BASE}/{permalink}"
            response = requests.get(page_url, timeout=10)

        if response.status_code != 200:
            print(f"  ❌ Failed to fetch {page_url}: {response.status_code}")
            return None, None

        document = BeautifulSoup(response.text, 'html.parser')

        bandcamp_url = None
        nugs_url = None

        for anchor in document.find_all('a', href=True):
            target = anchor['href']

            # Both accepted Bandcamp patterns already contain "bandcamp.com",
            # so no separate domain check is needed.
            if 'goosetheband.bandcamp.com' in target or 'bandcamp.com/album' in target:
                bandcamp_url = target

            # Nugs links must be on nugs.net and look like a Goose recording.
            if 'nugs.net' in target and ('/goose-' in target or 'recording' in target):
                nugs_url = target

        return bandcamp_url, nugs_url

    except Exception as e:
        print(f"  ⚠️ Scraping error: {e}")
        return None, None

def main():
    """Populate missing Bandcamp/Nugs links on the shows stored in the DB.

    Steps:
      1. Fetch the show list from the API and build a date -> permalink map.
      2. For every ``Show`` row that has a permalink for its date and is
         missing at least one link, scrape the show page.
      3. Store any newly found link, committing after each updated show so
         progress survives an interruption.

    Prints progress as it goes and a final count of successfully saved shows.
    """
    print("🔗 Starting Link Population Script...")

    # 1. Fetch API data to get permalinks (since we didn't store them)
    api_shows = get_shows_from_api()
    print(f"✓ Found {len(api_shows)} shows in API.")

    # Create lookup map: Date -> Permalink
    # Note: API date is "YYYY-MM-DD"
    # Records missing a date or permalink are skipped rather than raising
    # KeyError; they could never be matched or scraped anyway.
    # NOTE(review): if the API returns several shows on the same date, only
    # the last one's permalink is kept -- confirm whether that can happen.
    date_to_permalink = {
        s['showdate']: s['permalink']
        for s in api_shows
        if s.get('showdate') and s.get('permalink')
    }

    with Session(engine) as session:
        # 2. Get our DB shows
        db_shows = session.exec(select(Show)).all()
        print(f"✓ Found {len(db_shows)} shows in DB to check.")

        updates = 0

        for show in db_shows:
            s_date = show.date.strftime("%Y-%m-%d")
            permalink = date_to_permalink.get(s_date)

            if not permalink:
                continue

            # Skip if we already have both links (optional, but good for speed)
            if show.bandcamp_link and show.nugs_link:
                continue

            print(f"Processing {s_date}...", end="", flush=True)

            bc_link, nugs_link = scrape_links(permalink)

            updated = False
            if bc_link and bc_link != show.bandcamp_link:
                show.bandcamp_link = bc_link
                updated = True
                print(" [BC]", end="")

            if nugs_link and nugs_link != show.nugs_link:
                show.nugs_link = nugs_link
                updated = True
                print(" [Nugs]", end="")

            if updated:
                session.add(show)
                try:
                    session.commit()  # Commit frequently to save progress
                    # Count only shows that were actually persisted.
                    updates += 1
                    session.refresh(show)
                    print(" ✓")
                except Exception as e:
                    # A failed commit leaves the session's transaction in an
                    # invalid state; roll back so later shows can still be saved.
                    session.rollback()
                    print(f" ❌ Save error: {e}")
            else:
                print(" (No new links)")

            # Be nice to the server: pause longer after a show that yielded
            # new links, briefly otherwise.
            if updated:
                time.sleep(1)
            else:
                time.sleep(0.1)

    print(f"\n🎉 Done! Updated {updates} shows.")

# Run the link population only when executed as a script, not on import.
if __name__ == "__main__":
    main()