feat(backend): Add populate_links script
parent 8c2f5e3fdd
commit eb83a3b65f

1 changed file with 163 additions and 0 deletions

backend/populate_links.py (new file, +163)
@@ -0,0 +1,163 @@
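"""One-off backfill script: look up each DB show's permalink via the El Goose
API, scrape the show page for Bandcamp and Nugs links, and save them on any
Show row that is missing either link."""
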
import time

import requests
from bs4 import BeautifulSoup
from sqlmodel import Session, select

from database import engine
from models import Show

# El Goose API
API_BASE = "https://elgoose.net/api/v2"
SITE_BASE = "https://elgoose.net"


def get_shows_from_api():
    """Fetch all shows with their permalinks from the API."""
    print("Fetching all shows from API...")
    url = f"{API_BASE}/shows.json"
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}  # Get all, newest first
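    # NOTE: assumes the v2 endpoint accepts a `page` query param and returns
    # rows wrapped in a {"data": [...]} envelope; both are checked below.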

    all_shows = []
    page = 1
    while True:
        params['page'] = page
        print(f" Fetching page {page}...", end="", flush=True)
        try:
            resp = requests.get(url, params=params, timeout=15)  # timeout so a stalled connection can't hang the run
            resp.raise_for_status()
            data = resp.json()
            if not data or 'data' not in data:
                print(" Done.")
                break

            chunk = data['data']
            if not chunk:
                print(" Done.")
                break

            all_shows.extend(chunk)
            print(f" Got {len(chunk)} shows.")
            page += 1
            if page > 50:  # Safety valve against runaway pagination
                break
        except Exception as e:
            print(f" Error: {e}")
            break

    return all_shows
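
# NOTE: each row from the API is a plain dict; main() below relies only on
# the 'showdate' ("YYYY-MM-DD") and 'permalink' fields being present.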


def scrape_links(permalink):
    """Scrape Bandcamp and Nugs links from an El Goose show page."""
    if not permalink:
        return None, None

    url = f"{SITE_BASE}/setlists/{permalink}"
    # Show pages have been observed under /setlists/, but some may live at
    # the site root, so try /setlists/ first and fall back below.

    try:
        # print(f" Scraping {url}...")
        resp = requests.get(url, timeout=10)
        if resp.status_code == 404:
            # Try the root path instead
            url = f"{SITE_BASE}/{permalink}"
            resp = requests.get(url, timeout=10)

        if resp.status_code != 200:
            print(f" ❌ Failed to fetch {url}: {resp.status_code}")
            return None, None

        soup = BeautifulSoup(resp.text, 'html.parser')

        bandcamp = None
        nugs = None

        # The links sit in no predictable container, so scan every <a> tag
        # and pattern-match on the href.
        for a in soup.find_all('a', href=True):
            href = a['href']

            # Bandcamp: the band's own subdomain or any album page
            if 'bandcamp.com' in href:
                if 'goosetheband.bandcamp.com' in href or 'bandcamp.com/album' in href:
                    bandcamp = href

            # Nugs: a Goose artist path or a recording page
            if 'nugs.net' in href:
                if '/goose-' in href or 'recording' in href:
                    nugs = href

        return bandcamp, nugs

    except Exception as e:
        print(f" ⚠️ Scraping error: {e}")
        return None, None


def main():
    print("🔗 Starting Link Population Script...")

    # 1. Fetch API data to get permalinks (since we didn't store them)
    api_shows = get_shows_from_api()
    print(f"✓ Found {len(api_shows)} shows in API.")

    # Create lookup map: date -> permalink
    # Note: API dates are "YYYY-MM-DD"
    date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}
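    # CAVEAT: the comprehension keeps one permalink per date, so on a day
    # with multiple shows only the last row returned by the API wins.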

    with Session(engine) as session:
        # 2. Get our DB shows
        db_shows = session.exec(select(Show)).all()
        print(f"✓ Found {len(db_shows)} shows in DB to check.")

        updates = 0

        for show in db_shows:
            s_date = show.date.strftime("%Y-%m-%d")
            permalink = date_to_permalink.get(s_date)

            if not permalink:
                # print(f" ⚠️ No permalink found for {s_date}")
                continue

            # Skip if we already have both links (optional, but good for speed)
            if show.bandcamp_link and show.nugs_link:
                continue

            print(f"Processing {s_date}...", end="", flush=True)

            bc_link, nugs_link = scrape_links(permalink)

            updated = False
            if bc_link and bc_link != show.bandcamp_link:
                show.bandcamp_link = bc_link
                updated = True
                print(" [BC]", end="")

            if nugs_link and nugs_link != show.nugs_link:
                show.nugs_link = nugs_link
                updated = True
                print(" [Nugs]", end="")

            if updated:
                session.add(show)
                updates += 1
                try:
                    session.commit()  # Commit frequently to save progress
                    session.refresh(show)
                    print(" ✓")
                except Exception as e:
                    session.rollback()  # Reset the session so later shows can still commit
                    print(f" ❌ Save error: {e}")
            else:
                print(" (No new links)")

            # Be nice to the server
            if updated:
                time.sleep(1)  # Sleep only if we did work
            else:
                time.sleep(0.1)

    print(f"\n🎉 Done! Updated {updates} shows.")


if __name__ == "__main__":
    main()
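
# Usage (assumed: run from the backend/ directory so that `database` and
# `models` resolve as top-level imports, with dependencies installed):
#   python populate_links.py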