feat(backend): Add populate_links script
parent 8c2f5e3fdd
commit eb83a3b65f

1 changed file with 163 additions and 0 deletions

backend/populate_links.py (new file, +163)
@@ -0,0 +1,163 @@
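"""One-off backfill script: look up each DB show's permalink via the El Goose
API, scrape the show page for Bandcamp and Nugs links, and save them on any
Show row that is missing either link."""
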
import time

import requests
from bs4 import BeautifulSoup
from sqlmodel import Session, select

from database import engine
from models import Show

# El Goose API
API_BASE = "https://elgoose.net/api/v2"
SITE_BASE = "https://elgoose.net"


def get_shows_from_api():
    """Fetch all shows with their permalinks from the API."""
    print("Fetching all shows from API...")
    url = f"{API_BASE}/shows.json"
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}  # Get all, newest first
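    # NOTE: assumes the v2 endpoint accepts a `page` query param and returns
    # rows wrapped in a {"data": [...]} envelope; both are checked below.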

    all_shows = []
    page = 1
    while True:
        params['page'] = page
        print(f" Fetching page {page}...", end="", flush=True)
        try:
            resp = requests.get(url, params=params, timeout=15)  # timeout so a stalled connection can't hang the run
            resp.raise_for_status()
            data = resp.json()
            if not data or 'data' not in data:
                print(" Done.")
                break

            chunk = data['data']
            if not chunk:
                print(" Done.")
                break

            all_shows.extend(chunk)
            print(f" Got {len(chunk)} shows.")
            page += 1
            if page > 50:  # Safety valve against runaway pagination
                break
        except Exception as e:
            print(f" Error: {e}")
            break

    return all_shows
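
# NOTE: each row from the API is a plain dict; main() below relies only on
# the 'showdate' ("YYYY-MM-DD") and 'permalink' fields being present.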


def scrape_links(permalink):
    """Scrape Bandcamp and Nugs links from an El Goose show page."""
    if not permalink:
        return None, None

    url = f"{SITE_BASE}/setlists/{permalink}"
    # Show pages have been observed under /setlists/, but some may live at
    # the site root, so try /setlists/ first and fall back below.

    try:
        # print(f" Scraping {url}...")
        resp = requests.get(url, timeout=10)
        if resp.status_code == 404:
            # Try the root path instead
            url = f"{SITE_BASE}/{permalink}"
            resp = requests.get(url, timeout=10)

        if resp.status_code != 200:
            print(f" ❌ Failed to fetch {url}: {resp.status_code}")
            return None, None

        soup = BeautifulSoup(resp.text, 'html.parser')

        bandcamp = None
        nugs = None

        # The links sit in no predictable container, so scan every <a> tag
        # and pattern-match on the href.
        for a in soup.find_all('a', href=True):
            href = a['href']

            # Bandcamp: the band's own subdomain or any album page
            if 'bandcamp.com' in href:
                if 'goosetheband.bandcamp.com' in href or 'bandcamp.com/album' in href:
                    bandcamp = href

            # Nugs: a Goose artist path or a recording page
            if 'nugs.net' in href:
                if '/goose-' in href or 'recording' in href:
                    nugs = href

        return bandcamp, nugs

    except Exception as e:
        print(f" ⚠️ Scraping error: {e}")
        return None, None


def main():
    print("🔗 Starting Link Population Script...")

    # 1. Fetch API data to get permalinks (since we didn't store them)
    api_shows = get_shows_from_api()
    print(f"✓ Found {len(api_shows)} shows in API.")

    # Create lookup map: date -> permalink
    # Note: API dates are "YYYY-MM-DD"
    date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}
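    # CAVEAT: the comprehension keeps one permalink per date, so on a day
    # with multiple shows only the last row returned by the API wins.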

    with Session(engine) as session:
        # 2. Get our DB shows
        db_shows = session.exec(select(Show)).all()
        print(f"✓ Found {len(db_shows)} shows in DB to check.")

        updates = 0

        for show in db_shows:
            s_date = show.date.strftime("%Y-%m-%d")
            permalink = date_to_permalink.get(s_date)

            if not permalink:
                # print(f" ⚠️ No permalink found for {s_date}")
                continue

            # Skip if we already have both links (optional, but good for speed)
            if show.bandcamp_link and show.nugs_link:
                continue

            print(f"Processing {s_date}...", end="", flush=True)

            bc_link, nugs_link = scrape_links(permalink)

            updated = False
            if bc_link and bc_link != show.bandcamp_link:
                show.bandcamp_link = bc_link
                updated = True
                print(" [BC]", end="")

            if nugs_link and nugs_link != show.nugs_link:
                show.nugs_link = nugs_link
                updated = True
                print(" [Nugs]", end="")

            if updated:
                session.add(show)
                updates += 1
                try:
                    session.commit()  # Commit frequently to save progress
                    session.refresh(show)
                    print(" ✓")
                except Exception as e:
                    session.rollback()  # Reset the session so later shows can still commit
                    print(f" ❌ Save error: {e}")
            else:
                print(" (No new links)")

            # Be nice to the server
            if updated:
                time.sleep(1)  # Sleep only if we did work
            else:
                time.sleep(0.1)

    print(f"\n🎉 Done! Updated {updates} shows.")


if __name__ == "__main__":
    main()
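
# Usage (assumed: run from the backend/ directory so that `database` and
# `models` resolve as top-level imports, with dependencies installed):
#   python populate_links.py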