fix(backend): Optimize populate_links script
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
parent eb83a3b65f
commit 15e99b506a
1 changed file with 43 additions and 51 deletions
@@ -15,55 +15,42 @@ def get_shows_from_api():
     """Fetch all shows with their permalinks from the API"""
     print("Fetching all shows from API...")
     url = f"{API_BASE}/shows.json"
-    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}  # Get all, newest first
+    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}
 
     all_shows = []
-    page = 1
-    while True:
-        params['page'] = page
-        print(f" Fetching page {page}...", end="", flush=True)
-        try:
-            resp = requests.get(url, params=params)
-            resp.raise_for_status()
-            data = resp.json()
-            if not data or 'data' not in data:
-                print(" Done.")
-                break
-
-            chunk = data['data']
-            if not chunk:
-                print(" Done.")
-                break
-
-            all_shows.extend(chunk)
-            print(f" Got {len(chunk)} shows.")
-            page += 1
-            if page > 50:  # Safety
-                break
-        except Exception as e:
-            print(f" Error: {e}")
-            break
+    # It seems the API might return ALL shows on page 1 if no limit is set.
+    # We will try fetching page 1.
+    try:
+        print(f" Fetching shows...", end="", flush=True)
+        resp = requests.get(url, params=params)
+        resp.raise_for_status()
+        data = resp.json()
+
+        if data and 'data' in data:
+            all_shows = data['data']
+            print(f" Got {len(all_shows)} shows.")
+            return all_shows
+    except Exception as e:
+        print(f" Error: {e}")
 
     return all_shows
 
-def scrape_links(permalink):
+def scrape_links(session_http, permalink):
     """Scrape Bandcamp and Nugs links from an El Goose show page"""
     if not permalink:
         return None, None
 
     url = f"{SITE_BASE}/setlists/{permalink}"
-    # Sometimes it might be at root? Try setlists/ first as per observation.
 
     try:
-        # print(f" Scraping {url}...")
-        resp = requests.get(url, timeout=10)
+        resp = session_http.get(url, timeout=10)
         if resp.status_code == 404:
-            # Try root
             url = f"{SITE_BASE}/{permalink}"
-            resp = requests.get(url, timeout=10)
+            resp = session_http.get(url, timeout=10)
 
         if resp.status_code != 200:
-            print(f" ❌ Failed to fetch {url}: {resp.status_code}")
             return None, None
 
         soup = BeautifulSoup(resp.text, 'html.parser')
 
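The rewritten fetch drops the pagination loop and assumes the endpoint returns every show in a single {"data": [...]} payload when no page is requested. A minimal standalone sketch of that call pattern (the API_BASE constant and the requests import live at the top of the script, outside this diff; the explicit timeout is an added assumption):

import requests

def fetch_all_shows(api_base):
    """One request for the full show list; assumes the API ignores paging when none is asked for."""
    params = {"artist": 1, "order_by": "showdate", "direction": "DESC"}
    resp = requests.get(f"{api_base}/shows.json", params=params, timeout=30)
    resp.raise_for_status()
    payload = resp.json()
    # The API wraps results in a "data" key; fall back to an empty list if it is missing.
    return payload.get("data", []) if isinstance(payload, dict) else []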
@@ -71,10 +58,6 @@ def scrape_links(permalink):
         bandcamp = None
         nugs = None
 
-        # Look for links.
-        # Usually they are in some container or just raw <a> tags
-        # Pattern matching for hrefs
-
         for a in soup.find_all('a', href=True):
             href = a['href']
 
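The anchor scan above feeds the href matching that sits just past this hunk and is unchanged by the commit. A hedged sketch of that kind of extraction, assuming plain substring checks rather than whatever matching the untouched code actually uses:

from bs4 import BeautifulSoup

def extract_links(html):
    """Return (bandcamp, nugs) hrefs found in a setlist page, or None for each that is missing."""
    soup = BeautifulSoup(html, 'html.parser')
    bandcamp = None
    nugs = None
    for a in soup.find_all('a', href=True):
        href = a['href']
        if bandcamp is None and 'bandcamp.com' in href:
            bandcamp = href
        if nugs is None and 'nugs.net' in href:
            nugs = href
    return bandcamp, nugs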
@@ -97,36 +80,48 @@ def scrape_links(permalink):
 def main():
     print("🔗 Starting Link Population Script...")
 
-    # 1. Fetch API data to get permalinks (since we didn't store them)
+    # 1. Fetch API data
     api_shows = get_shows_from_api()
     print(f"✓ Found {len(api_shows)} shows in API.")
 
-    # Create lookup map: Date -> Permalink
-    # Note: API date is "YYYY-MM-DD"
+    if not api_shows:
+        print("❌ No shows found in API. Exiting.")
+        return
 
     date_to_permalink = {s['showdate']: s['permalink'] for s in api_shows}
 
+    # Setup HTTP Session
+    http = requests.Session()
+    http.headers.update({
+        "User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"
+    })
+
     with Session(engine) as session:
         # 2. Get our DB shows
         db_shows = session.exec(select(Show)).all()
+        # Sort by date desc to update newest first
+        db_shows.sort(key=lambda x: x.date, reverse=True)
 
         print(f"✓ Found {len(db_shows)} shows in DB to check.")
 
         updates = 0
+        checked = 0
 
         for show in db_shows:
+            checked += 1
             s_date = show.date.strftime("%Y-%m-%d")
             permalink = date_to_permalink.get(s_date)
 
             if not permalink:
-                # print(f" ⚠️ No permalink found for {s_date}")
                 continue
 
-            # Skip if we already have both links (optional, but good for speed)
+            # Skip if we already have both
             if show.bandcamp_link and show.nugs_link:
                 continue
 
-            print(f"Processing {s_date}...", end="", flush=True)
+            print(f"[{checked}/{len(db_shows)}] {s_date}...", end="", flush=True)
 
-            bc_link, nugs_link = scrape_links(permalink)
+            bc_link, nugs_link = scrape_links(http, permalink)
 
             updated = False
             if bc_link and bc_link != show.bandcamp_link:
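Routing every scrape through one requests.Session gives connection pooling (the TCP/TLS handshake to the site happens once per pooled connection rather than once per show) and a single place to set the User-Agent. A small sketch of the pattern in isolation; the URL below is a stand-in for the script's SITE_BASE plus a permalink:

import requests

http = requests.Session()
http.headers.update({"User-Agent": "ElmegDemoBot/1.0 (+http://elmeg.xyz)"})

# Every call made through the session reuses pooled connections and the shared headers.
resp = http.get("https://example.com/setlists/some-permalink", timeout=10)
print(resp.status_code)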
@@ -143,19 +138,16 @@ def main():
                 session.add(show)
                 updates += 1
                 try:
-                    session.commit()  # Commit frequently to save progress
+                    session.commit()
                     session.refresh(show)
                     print(" ✓")
                 except Exception as e:
                     print(f" ❌ Save error: {e}")
             else:
-                print(" (No new links)")
+                print(" -")
 
-            # Be nice to the server
-            if updated:
-                time.sleep(1)  # Sleep only if we did work
-            else:
-                time.sleep(0.1)
+            # Small delay
+            time.sleep(0.1)
 
     print(f"\n🎉 Done! Updated {updates} shows.")
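Committing inside the loop keeps every show saved so far if the run dies partway through, and the flat 0.1 s pause now throttles every iteration instead of only the ones that changed. A compressed sketch of that save-and-throttle shape; the rollback on failure is an assumption this diff does not show:

import time

def save_with_throttle(session, changed_shows, delay=0.1):
    """Commit each changed row immediately so progress survives a crash; pause between rows."""
    updates = 0
    for show in changed_shows:
        session.add(show)
        try:
            session.commit()
            updates += 1
        except Exception as e:
            # Roll back the failed transaction so the next show can still commit (assumed handling).
            session.rollback()
            print(f" ❌ Save error: {e}")
        time.sleep(delay)
    return updates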