""" Smart Setlist Importer (Streaming Version) Reducing memory usage by processing data in streams instead of bulk loading. """ import requests import time import gc from datetime import datetime from sqlmodel import Session, select from database import engine from models import Show, Song, Performance from slugify import generate_slug BASE_URL = "https://elgoose.net/api/v2" def fetch_json(endpoint, params=None): """Fetch JSON from El Goose API with retries""" url = f"{BASE_URL}/{endpoint}.json" for attempt in range(3): try: response = requests.get(url, params=params, timeout=30) response.raise_for_status() data = response.json() if data.get('error') == 1: return None return data.get('data', []) except Exception as e: print(f" Error fetching {endpoint} (attempt {attempt+1}): {e}") time.sleep(2) return None def main(): print("=" * 60) print("SMART SETLIST IMPORTER (STREAMING)") print("=" * 60) with Session(engine) as session: # 1. Build DB Map: Date string -> DB Show ID print("\n1. Building DB Map (Date -> Show ID)...") shows = session.exec(select(Show.id, Show.date)).all() # Only fetch needed fields date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows} print(f" Mapped {len(date_to_db_id)} existing shows in DB") if not date_to_db_id: print(" CRITICAL: No shows in database!") return del shows gc.collect() # 2. Build API Map: ElGoose ID -> DB ID # Process iteratively to save memory print("\n2. Building ElGoose ID -> DB ID map (Streaming)...") elgoose_id_to_db_id = {} matched_count = 0 page = 1 seen_ids_in_run = set() while True: # Fetch batch of shows print(f" Fetching shows page {page}...", end="\r", flush=True) data = fetch_json("shows", {"page": page}) # Fetch all shows (artist filter can be flaky) if not data: break # Check for API loop (if Page X returns same content as Page 1) first_id_in_batch = data[0].get('show_id') if data else None if first_id_in_batch and first_id_in_batch in seen_ids_in_run: print(f"\n Loop detected at page {page} (ID {first_id_in_batch} seen before). Breaking.") break for s in data: # We only need Goose shows (artist_id=3 usually, but we check date match) s_date = s.get('showdate') s_id = s.get('show_id') if s_id: seen_ids_in_run.add(s_id) if s_date and s_id: db_id = date_to_db_id.get(s_date) if db_id: elgoose_id_to_db_id[s_id] = db_id matched_count += 1 page += 1 if page % 10 == 0: gc.collect() print(f"\n Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs") del date_to_db_id gc.collect() # 3. Cache Songs print("\n3. Caching Songs...") songs = session.exec(select(Song.id, Song.title)).all() song_map = {s.title.lower().strip(): s.id for s in songs} del songs gc.collect() print(f" Cached {len(song_map)} songs") # 4. Process Setlists print("\n4. Importing Setlists...") page = 1 total_added = 0 while True: data = fetch_json("setlists", {"page": page}) if not data: break # Prefetch checks for this batch to avoid N+1 SELECTs? # Actually with 3600 perfs, one-by-one check is slow. # But "existing check" is needed. # We can cache *existing performances* for the CURRENT batch's shows? # Or just cache ALL existing performance keys (show_id, song_id, position)? # Performance table might be large (40k rows?). # (show_id, song_id, position) tuples set is ~2MB RAM. Safe. 
            if page == 1:
                print("   Caching existing performance keys...")
                perfs = session.exec(
                    select(Performance.show_id, Performance.song_id, Performance.position)
                ).all()
                existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
                print(f"   Cached {len(existing_keys)} performance keys")
                del perfs
                gc.collect()

            batch_added = 0
            new_objects = []

            for perf in data:
                elgoose_show_id = perf.get('show_id')
                db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
                if not db_show_id:
                    continue

                song_name = (perf.get('songname') or '').strip()  # guard against missing/null song names
                song_id = song_map.get(song_name.lower())
                if not song_id:
                    continue

                position = perf.get('position', 0)

                # Skip rows that already exist
                if (db_show_id, song_id, position) in existing_keys:
                    continue

                # Map the API's set marker to a display name
                set_val = str(perf.get('setnumber', '1'))
                if set_val.isdigit():
                    set_name = f"Set {set_val}"
                elif set_val.lower() == 'e':
                    set_name = "Encore"
                elif set_val.lower() == 'e2':
                    set_name = "Encore 2"
                elif set_val.lower() == 's':
                    set_name = "Soundcheck"
                else:
                    set_name = f"Set {set_val}"

                new_perf = Performance(
                    show_id=db_show_id,
                    song_id=song_id,
                    position=position,
                    set_name=set_name,
                    segue=bool(perf.get('segue', 0)),
                    notes=perf.get('footnote'),
                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}",
                )
                new_objects.append(new_perf)
                existing_keys.add((db_show_id, song_id, position))  # keep the cache current
                batch_added += 1
                total_added += 1

            if new_objects:
                session.add_all(new_objects)
                session.commit()

            print(f"   Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
            page += 1
            if page % 20 == 0:
                gc.collect()

    print(f"\nImport Complete! Total Added: {total_added}")


if __name__ == "__main__":
    main()
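
# ---------------------------------------------------------------------------
# Reference only: a minimal sketch of the SQLModel tables this importer assumes.
# The real definitions live in models.py; the fields below are inferred from how
# Show, Song, and Performance are used above, so the types and defaults are
# assumptions and may not match the actual schema.
#
#   from datetime import date
#   from typing import Optional
#   from sqlmodel import Field, SQLModel
#
#   class Show(SQLModel, table=True):
#       id: Optional[int] = Field(default=None, primary_key=True)
#       date: date
#
#   class Song(SQLModel, table=True):
#       id: Optional[int] = Field(default=None, primary_key=True)
#       title: str
#
#   class Performance(SQLModel, table=True):
#       id: Optional[int] = Field(default=None, primary_key=True)
#       show_id: int = Field(foreign_key="show.id")
#       song_id: int = Field(foreign_key="song.id")
#       position: int = 0
#       set_name: str = "Set 1"
#       segue: bool = False
#       notes: Optional[str] = None
#       slug: str = ""
# ---------------------------------------------------------------------------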