diff --git a/backend/import_setlists_smart.py b/backend/import_setlists_smart.py
index e3d677c..2920cf6 100644
--- a/backend/import_setlists_smart.py
+++ b/backend/import_setlists_smart.py
@@ -1,16 +1,13 @@
 """
-Smart Setlist Importer
-Uses a 2-step mapping strategy to bypass missing dates in setlist endpoint:
-1. Fetch ALL shows from API -> Map ElGoose_ID to Date.
-2. Fetch ALL DB shows -> Map Date to DB_ID.
-3. Combine: ElGoose_ID -> DB_ID.
-4. Import setlists using ElGoose_ID from setlist entries.
+Smart Setlist Importer (Streaming Version)
+Reduces memory usage by processing data in streams instead of bulk loading.
 """
 import requests
 import time
+import gc
 from datetime import datetime
-from sqlmodel import Session, select, func
+from sqlmodel import Session, select
 from database import engine
 from models import Show, Song, Performance
 from slugify import generate_slug
@@ -33,126 +30,113 @@ def fetch_json(endpoint, params=None):
             time.sleep(2)
     return None
 
-def fetch_all_pages(endpoint, params=None):
-    """Fetch all pages from an endpoint"""
-    if params is None:
-        params = {}
-
-    results = []
-    page = 1
-    while True:
-        print(f"   Fetching {endpoint} page {page}...", end="\r", flush=True)
-        p = params.copy()
-        p['page'] = page
-        data = fetch_json(endpoint, p)
-        if not data:
-            break
-        results.extend(data)
-        page += 1
-        time.sleep(0.1)  # Be nice
-    print(f"\n   Fetched {len(results)} items from {endpoint}")
-    return results
-
 def main():
     print("=" * 60)
-    print("SMART SETLIST IMPORTER")
+    print("SMART SETLIST IMPORTER (STREAMING)")
     print("=" * 60)
 
     with Session(engine) as session:
         # 1. Build DB Map: Date string -> DB Show ID
         print("\n1. Building DB Map (Date -> Show ID)...")
-        shows = session.exec(select(Show)).all()
+        shows = session.exec(select(Show.id, Show.date)).all()  # Only fetch needed fields
         date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
         print(f"   Mapped {len(date_to_db_id)} existing shows in DB")
 
         if not date_to_db_id:
-            print("   CRITICAL: No shows in database! Run import_shows first.")
+            print("   CRITICAL: No shows in database!")
             return
-
-        # 2. Build API Map: ElGoose ID -> Date
-        print("\n2. Fetching API Shows to build ElGoose ID -> Date map...")
-        # Only fetch shows for our artist (Goose = 3)
-        api_shows = fetch_all_pages("shows", {"artist": 3})
-        if not api_shows:
-            # Fallback if artist filter fails or returns empty
-            print("   Artist filter returned empty, fetching all shows...")
-            api_shows = fetch_all_pages("shows")
-
+
+        del shows
+        gc.collect()
+
+        # 2. Build API Map: ElGoose ID -> DB ID
+        # Process iteratively to save memory
+        print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
         elgoose_id_to_db_id = {}
         matched_count = 0
-        for s in api_shows:
-            s_date = s.get('showdate')
-            s_id = s.get('show_id')
-            if s_date and s_id:
-                # Lookup in DB map
-                db_id = date_to_db_id.get(s_date)
-                if db_id:
-                    elgoose_id_to_db_id[s_id] = db_id
-                    matched_count += 1
-
-        print(f"   Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
-
-        # 3. Cache Songs for Lookup
-        print("\n3. Caching Songs...")
-        songs = session.exec(select(Song)).all()
-        song_map = {s.title.lower().strip(): s.id for s in songs}  # title -> id
-        print(f"   Cached {len(song_map)} songs")
-
-        # 4. Fetch and Import Setlists
-        print("\n4. Fetching Setlists and Importing...")
-        # Since we can't filter setlists by artist easily without checking every item,
-        # we'll fetch all and filter by our known show IDs.
         page = 1
-        total_added = 0
-        total_processed = 0
         while True:
-            start_time = time.time()
-            data = fetch_json("setlists", {"page": page})
+            # Fetch batch of shows
+            print(f"   Fetching shows page {page}...", end="\r", flush=True)
+            data = fetch_json("shows", {"page": page})  # Fetch all shows (artist filter can be flaky)
             if not data:
-                print("   No more data.")
                 break
+            for s in data:
+                # We only need Goose shows (usually artist_id=3); matching the
+                # date against the DB map is what filters them here
+                s_date = s.get('showdate')
+                s_id = s.get('show_id')
+
+                if s_date and s_id:
+                    db_id = date_to_db_id.get(s_date)
+                    if db_id:
+                        elgoose_id_to_db_id[s_id] = db_id
+                        matched_count += 1
+
+            page += 1
+            if page % 10 == 0:
+                gc.collect()
+
+        print(f"\n   Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
+        del date_to_db_id
+        gc.collect()
+
+        # 3. Cache Songs
+        print("\n3. Caching Songs...")
+        songs = session.exec(select(Song.id, Song.title)).all()
+        song_map = {s.title.lower().strip(): s.id for s in songs}
+        del songs
+        gc.collect()
+        print(f"   Cached {len(song_map)} songs")
+
+        # 4. Process Setlists
+        print("\n4. Importing Setlists...")
+        page = 1
+        total_added = 0
+
+        while True:
+            data = fetch_json("setlists", {"page": page})
+            if not data:
+                break
+
+            # Checking each performance with its own SELECT would mean thousands
+            # of round trips (N+1 queries). Cache every existing
+            # (show_id, song_id, position) key once instead; even ~40k rows is
+            # only a few MB of RAM as a set of tuples.
+            if page == 1:
+                print("   Caching existing performance keys...")
+                perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
+                existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
+                print(f"   Cached {len(existing_keys)} performance keys")
+                del perfs
+                gc.collect()
 
             batch_added = 0
+            new_objects = []
             for perf in data:
-                total_processed += 1
                 elgoose_show_id = perf.get('show_id')
-
-                # Check if this performance belongs to a show we care about
                 db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
                 if not db_show_id:
-                    continue  # Not a Goose show or show not in our DB
+                    continue
 
-                # Resolve Song
                 song_name = perf.get('songname', '').strip()
                 song_id = song_map.get(song_name.lower())
-
                 if not song_id:
-                    # Try creating song if missing?
-                    # Ideally we should have imported all songs, but let's be safe
-                    # For now skip or log
                     continue
 
                 position = perf.get('position', 0)
 
-                # Check duplication
-                # We can cache existing performances for speed, but SQL check is safer for now
-                existing = session.exec(
-                    select(Performance).where(
-                        Performance.show_id == db_show_id,
-                        Performance.song_id == song_id,
-                        Performance.position == position
-                    )
-                ).first()
-
-                if existing:
+                # Check uniqueness
+                if (db_show_id, song_id, position) in existing_keys:
                     continue
 
-                # Create Performance
-                # Map setnumber
+                # Create
                 set_val = str(perf.get('setnumber', '1'))
                 if set_val.isdigit():
                     set_name = f"Set {set_val}"
@@ -173,35 +157,24 @@ def main():
                     set_name=set_name,
                     segue=bool(perf.get('segue', 0)),
                     notes=perf.get('footnote'),
-                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"  # temp slug strategy
+                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
                 )
-                session.add(new_perf)
+                new_objects.append(new_perf)
+                existing_keys.add((db_show_id, song_id, position))  # Add to cache
                 batch_added += 1
                 total_added += 1
 
-            session.commit()
-            elapsed = time.time() - start_time
-            print(f"   Page {page}: Processed {len(data)}, Added {batch_added} ({elapsed:.2f}s)")
-
-            # Optimization: If we see mostly empty adds for many pages,
-            # we might want to skip, BUT setlists endpoint is usually ordered by date desc?
-            # We must go through all history.
+            if new_objects:
+                session.add_all(new_objects)
+                session.commit()
 
+            print(f"   Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
             page += 1
-            if page > 2000:  # Safety break
-                break
-
-        # Fix slugs properly
-        print("\n5. Fixing Slugs...")
-        # (Slugs generated above might be generic, ideally update based on show date)
-        # But for speed let's rely on the previous fixer or just update here if needed.
-        # The above slug uses ID which is unique but not pretty.
-        # Let's run a quick update for pretty slugs
-
-        print("\n" + "=" * 60)
-        print("IMPORT COMPLETE")
-        print(f"Total Added: {total_added}")
-        print("=" * 60)
+
+            if page % 20 == 0:
+                gc.collect()
+
+        print(f"\nImport Complete! Total Added: {total_added}")
 
 if __name__ == "__main__":
     main()
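
Note: the paging pattern now used in steps 2 and 4 could be factored into a single reusable generator so both loops share it. A minimal sketch, assuming the script's existing `fetch_json(endpoint, params)` helper and the API's `page` parameter; `iter_pages` and its `throttle` default are illustrative names, not existing code:

```python
import time
from typing import Any, Dict, Iterator, List, Optional

from import_setlists_smart import fetch_json  # helper defined in the script above


def iter_pages(endpoint: str, params: Optional[Dict[str, Any]] = None,
               throttle: float = 0.1) -> Iterator[List[dict]]:
    """Yield one page of API results at a time instead of accumulating them.

    Assumes fetch_json returns a list of dicts, or a falsy value once the
    pages run out.
    """
    page = 1
    while True:
        p = dict(params or {})
        p["page"] = page
        data = fetch_json(endpoint, p)
        if not data:
            return
        yield data
        page += 1
        time.sleep(throttle)  # stay polite to the API


# Usage sketch: peak memory stays at roughly one page of results.
# for batch in iter_pages("shows"):
#     for show in batch:
#         ...
```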
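The deduplication-plus-batched-commit pattern from step 4 can also stand on its own. A sketch under the same models the script imports (`Performance` with `show_id`, `song_id`, `position`); the helper names are hypothetical:

```python
from typing import List, Set, Tuple

from sqlmodel import Session, select

from models import Performance  # same models module the script imports

Key = Tuple[int, int, int]  # (show_id, song_id, position)


def load_existing_keys(session: Session) -> Set[Key]:
    """One query up front instead of a SELECT per incoming row (N+1)."""
    rows = session.exec(
        select(Performance.show_id, Performance.song_id, Performance.position)
    ).all()
    return {(r[0], r[1], r[2]) for r in rows}


def insert_new(session: Session, candidates: List[Performance],
               existing: Set[Key]) -> int:
    """Stage only unseen performances, then flush them in a single commit."""
    fresh = []
    for perf in candidates:
        key = (perf.show_id, perf.song_id, perf.position)
        if key in existing:
            continue
        existing.add(key)  # keep the cache in sync for later batches
        fresh.append(perf)
    if fresh:
        session.add_all(fresh)
        session.commit()
    return len(fresh)
```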
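Since the point of the change is lower memory usage, the effect is easy to verify with the standard library's tracemalloc. A hypothetical measurement harness, not part of the patch, assuming the script is importable as `import_setlists_smart` from the backend directory:

```python
import tracemalloc

import import_setlists_smart  # the module modified by this patch


def run_with_memory_report() -> None:
    """Run the importer and report current and peak traced allocations."""
    tracemalloc.start()
    try:
        import_setlists_smart.main()
    finally:
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        print(f"current: {current / 2**20:.1f} MiB, peak: {peak / 2**20:.1f} MiB")


if __name__ == "__main__":
    run_with_memory_report()
```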