elmeg-demo/backend/import_setlists_smart.py
2025-12-25 11:44:49 -08:00

191 lines
6.8 KiB
Python

"""
Smart Setlist Importer (Streaming Version)
Reducing memory usage by processing data in streams instead of bulk loading.
"""
import requests
import time
import gc
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Show, Song, Performance
from slugify import generate_slug
BASE_URL = "https://elgoose.net/api/v2"
def fetch_json(endpoint, params=None):
"""Fetch JSON from El Goose API with retries"""
url = f"{BASE_URL}/{endpoint}.json"
for attempt in range(3):
try:
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()
if data.get('error') == 1:
return None
return data.get('data', [])
except Exception as e:
print(f" Error fetching {endpoint} (attempt {attempt+1}): {e}")
time.sleep(2)
return None
def main():
print("=" * 60)
print("SMART SETLIST IMPORTER (STREAMING)")
print("=" * 60)
with Session(engine) as session:
# 1. Build DB Map: Date string -> DB Show ID
print("\n1. Building DB Map (Date -> Show ID)...")
shows = session.exec(select(Show.id, Show.date)).all() # Only fetch needed fields
date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
print(f" Mapped {len(date_to_db_id)} existing shows in DB")
if not date_to_db_id:
print(" CRITICAL: No shows in database!")
return
del shows
gc.collect()
# 2. Build API Map: ElGoose ID -> DB ID
# Process iteratively to save memory
print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
elgoose_id_to_db_id = {}
matched_count = 0
page = 1
seen_ids_in_run = set()
while True:
# Fetch batch of shows
print(f" Fetching shows page {page}...", end="\r", flush=True)
data = fetch_json("shows", {"page": page}) # Fetch all shows (artist filter can be flaky)
if not data:
break
# Check for API loop (if Page X returns same content as Page 1)
first_id_in_batch = data[0].get('show_id') if data else None
if first_id_in_batch and first_id_in_batch in seen_ids_in_run:
print(f"\n Loop detected at page {page} (ID {first_id_in_batch} seen before). Breaking.")
break
for s in data:
# We only need Goose shows (artist_id=3 usually, but we check date match)
s_date = s.get('showdate')
s_id = s.get('show_id')
if s_id:
seen_ids_in_run.add(s_id)
if s_date and s_id:
db_id = date_to_db_id.get(s_date)
if db_id:
elgoose_id_to_db_id[s_id] = db_id
matched_count += 1
page += 1
if page % 10 == 0:
gc.collect()
print(f"\n Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
del date_to_db_id
gc.collect()
# 3. Cache Songs
print("\n3. Caching Songs...")
songs = session.exec(select(Song.id, Song.title)).all()
song_map = {s.title.lower().strip(): s.id for s in songs}
del songs
gc.collect()
print(f" Cached {len(song_map)} songs")
# 4. Process Setlists
print("\n4. Importing Setlists...")
page = 1
total_added = 0
while True:
data = fetch_json("setlists", {"page": page})
if not data:
break
# Prefetch checks for this batch to avoid N+1 SELECTs?
# Actually with 3600 perfs, one-by-one check is slow.
# But "existing check" is needed.
# We can cache *existing performances* for the CURRENT batch's shows?
# Or just cache ALL existing performance keys (show_id, song_id, position)?
# Performance table might be large (40k rows?).
# (show_id, song_id, position) tuples set is ~2MB RAM. Safe.
if page == 1:
print(" Caching existing performance keys...")
perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
print(f" Cached {len(existing_keys)} performance keys")
del perfs
gc.collect()
batch_added = 0
new_objects = []
for perf in data:
elgoose_show_id = perf.get('show_id')
db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
if not db_show_id:
continue
song_name = perf.get('songname', '').strip()
song_id = song_map.get(song_name.lower())
if not song_id:
continue
position = perf.get('position', 0)
# Check uniqueness
if (db_show_id, song_id, position) in existing_keys:
continue
# Create
set_val = str(perf.get('setnumber', '1'))
if set_val.isdigit():
set_name = f"Set {set_val}"
elif set_val.lower() == 'e':
set_name = "Encore"
elif set_val.lower() == 'e2':
set_name = "Encore 2"
elif set_val.lower() == 's':
set_name = "Soundcheck"
else:
set_name = f"Set {set_val}"
new_perf = Performance(
show_id=db_show_id,
song_id=song_id,
position=position,
set_name=set_name,
segue=bool(perf.get('segue', 0)),
notes=perf.get('footnote'),
slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
)
new_objects.append(new_perf)
existing_keys.add((db_show_id, song_id, position)) # Add to cache
batch_added += 1
total_added += 1
if new_objects:
session.add_all(new_objects)
session.commit()
print(f" Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
page += 1
if page % 20 == 0:
gc.collect()
print(f"\nImport Complete! Total Added: {total_added}")
if __name__ == "__main__":
main()