fix(backend): Optimize populate_links script
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
parent eb83a3b65f
commit 15e99b506a
1 changed file with 43 additions and 51 deletions
@@ -15,55 +15,42 @@ def get_shows_from_api():
     """Fetch all shows with their permalinks from the API"""
     print("Fetching all shows from API...")
     url = f"{API_BASE}/shows.json"
-    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}  # Get all, newest first
+    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}
 
     all_shows = []
-    page = 1
-    while True:
-        params['page'] = page
-        print(f" Fetching page {page}...", end="", flush=True)
-        try:
-            resp = requests.get(url, params=params)
-            resp.raise_for_status()
-            data = resp.json()
-            if not data or 'data' not in data:
-                print(" Done.")
-                break
-
-            chunk = data['data']
-            if not chunk:
-                print(" Done.")
-                break
-
-            all_shows.extend(chunk)
-            print(f" Got {len(chunk)} shows.")
-            page += 1
-            if page > 50:  # Safety
-                break
-        except Exception as e:
-            print(f" Error: {e}")
-            break
+    # It seems the API might return ALL shows on page 1 if no limit is set.
+    # We will try fetching page 1.
+    try:
+        print(f" Fetching shows...", end="", flush=True)
+        resp = requests.get(url, params=params)
+        resp.raise_for_status()
+        data = resp.json()
+
+        if data and 'data' in data:
+            all_shows = data['data']
+            print(f" Got {len(all_shows)} shows.")
+            return all_shows
+    except Exception as e:
+        print(f" Error: {e}")
 
     return all_shows
 
-def scrape_links(permalink):
+def scrape_links(session_http, permalink):
     """Scrape Bandcamp and Nugs links from an El Goose show page"""
     if not permalink:
         return None, None
 
     url = f"{SITE_BASE}/setlists/{permalink}"
-    # Sometimes it might be at root? Try setlists/ first as per observation.
 
     try:
-        # print(f" Scraping {url}...")
-        resp = requests.get(url, timeout=10)
+        resp = session_http.get(url, timeout=10)
         if resp.status_code == 404:
-            # Try root
             url = f"{SITE_BASE}/{permalink}"
-            resp = requests.get(url, timeout=10)
+            resp = session_http.get(url, timeout=10)
 
         if resp.status_code != 200:
-            print(f" ❌ Failed to fetch {url}: {resp.status_code}")
             return None, None
 
         soup = BeautifulSoup(resp.text, 'html.parser')
 
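The rewritten fetch drops the pagination loop and assumes the endpoint returns every show in a single {"data": [...]} payload when no page is requested. A minimal standalone sketch of that call pattern (the API_BASE constant and the requests import live at the top of the script, outside this diff; the explicit timeout is an added assumption):

import requests

def fetch_all_shows(api_base):
    """One request for the full show list; assumes the API ignores paging when none is asked for."""
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}
    resp = requests.get(f"{api_base}/shows.json", params=params, timeout=30)
    resp.raise_for_status()
    payload = resp.json()
    # The API wraps results in a "data" key; fall back to an empty list if it is missing.
    return payload.get("data", []) if isinstance(payload, dict) else []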
@@ -71,10 +58,6 @@ def scrape_links(permalink):
         bandcamp = None
         nugs = None
 
-        # Look for links.
-        # Usually they are in some container or just raw <a> tags
-        # Pattern matching for hrefs
-
         for a in soup.find_all('a', href=True):
             href = a['href']
 
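The anchor scan above feeds the href matching that sits just past this hunk and is unchanged by the commit. A hedged sketch of that kind of extraction, assuming plain substring checks rather than whatever matching the untouched code actually uses:

from bs4 import BeautifulSoup

def extract_links(html):
    """Return (bandcamp, nugs) hrefs found in a setlist page, or None for each that is missing."""
    soup = BeautifulSoup(html, 'html.parser')
    bandcamp = None
    nugs = None
    for a in soup.find_all('a', href=True):
        href = a['href']
        if bandcamp is None and 'bandcamp.com' in href:
            bandcamp = href
        if nugs is None and 'nugs.net' in href:
            nugs = href
    return bandcamp, nugs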
@@ -97,36 +80,48 @@ def scrape_links(permalink):
 def main():
     print("🔗 Starting Link Population Script...")
 
-    # 1. Fetch API data to get permalinks (since we didn't store them)
+    # 1. Fetch API data
     api_shows = get_shows_from_api()
     print(f"✓ Found {len(api_shows)} shows in API.")
 
-    # Create lookup map: Date -> Permalink
-    # Note: API date is "YYYY-MM-DD"
+    if not api_shows:
+        print("❌ No shows found in API. Exiting.")
+        return
 
     date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}
 
+    # Setup HTTP Session
+    http = requests.Session()
+    http.headers.update({
+        "User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"
+    })
+
     with Session(engine) as session:
         # 2. Get our DB shows
         db_shows = session.exec(select(Show)).all()
+        # Sort by date desc to update newest first
+        db_shows.sort(key=lambda x: x.date, reverse=True)
 
         print(f"✓ Found {len(db_shows)} shows in DB to check.")
 
         updates = 0
+        checked = 0
 
         for show in db_shows:
+            checked += 1
             s_date = show.date.strftime("%Y-%m-%d")
             permalink = date_to_permalink.get(s_date)
 
             if not permalink:
-                # print(f" ⚠️ No permalink found for {s_date}")
                 continue
 
-            # Skip if we already have both links (optional, but good for speed)
+            # Skip if we already have both
             if show.bandcamp_link and show.nugs_link:
                 continue
 
-            print(f"Processing {s_date}...", end="", flush=True)
+            print(f"[{checked}/{len(db_shows)}] {s_date}...", end="", flush=True)
 
-            bc_link, nugs_link = scrape_links(permalink)
+            bc_link, nugs_link = scrape_links(http, permalink)
 
             updated = False
             if bc_link and bc_link != show.bandcamp_link:
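Routing every scrape through one requests.Session gives connection pooling (the TCP/TLS handshake to the site happens once per pooled connection rather than once per show) and a single place to set the User-Agent. A small sketch of the pattern in isolation; the URL below is a stand-in for the script's SITE_BASE plus a permalink:

import requests

http = requests.Session()
http.headers.update({"User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"})

# Every call made through the session reuses pooled connections and the shared headers.
resp = http.get("https://example.com/setlists/some-permalink", timeout=10)
print(resp.status_code)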
@@ -143,19 +138,16 @@ def main():
                 session.add(show)
                 updates += 1
                 try:
-                    session.commit()  # Commit frequently to save progress
+                    session.commit()
                     session.refresh(show)
                     print(" ✓")
                 except Exception as e:
                     print(f" ❌ Save error: {e}")
             else:
-                print(" (No new links)")
+                print(" -")
 
-            # Be nice to the server
-            if updated:
-                time.sleep(1)  # Sleep only if we did work
-            else:
-                time.sleep(0.1)
+            # Small delay
+            time.sleep(0.1)
 
     print(f"\n🎉 Done! Updated {updates} shows.")
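Committing inside the loop keeps every show saved so far if the run dies partway through, and the flat 0.1 s pause now throttles every iteration instead of only the ones that changed. A compressed sketch of that save-and-throttle shape; the rollback on failure is an assumption this diff does not show:

import time

def save_with_throttle(session, changed_shows, delay=0.1):
    """Commit each changed row immediately so progress survives a crash; pause between rows."""
    updates = 0
    for show in changed_shows:
        session.add(show)
        try:
            session.commit()
            updates += 1
        except Exception as e:
            # Roll back the failed transaction so the next show can still commit (assumed handling).
            session.rollback()
            print(f" ❌ Save error: {e}")
        time.sleep(delay)
    return updates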