Optimize setlist import for memory

fullsizemalt 2025-12-25 11:05:54 -08:00
parent e2c77d7593
commit 14f016977a


@@ -1,16 +1,13 @@
"""
Smart Setlist Importer
Uses a 2-step mapping strategy to bypass missing dates in setlist endpoint:
1. Fetch ALL shows from API -> Map ElGoose_ID to Date.
2. Fetch ALL DB shows -> Map Date to DB_ID.
3. Combine: ElGoose_ID -> DB_ID.
4. Import setlists using ElGoose_ID from setlist entries.
Smart Setlist Importer (Streaming Version)
Reduces memory usage by processing data in streams instead of bulk-loading it.
"""
import requests
import time
import gc
from datetime import datetime
from sqlmodel import Session, select, func
from sqlmodel import Session, select
from database import engine
from models import Show, Song, Performance
from slugify import generate_slug
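The docstring's "streaming" claim is the heart of this change: instead of the old fetch_all_pages helper (next hunk) accumulating every page of API results in one list, pages are now fetched and discarded one at a time. A minimal sketch of the same pagination as a generator, assuming the script's fetch_json helper (iter_pages is a hypothetical name, not part of the commit):

def iter_pages(endpoint, params=None):
    # Yield one page of results at a time so peak memory stays at roughly
    # one page of JSON, regardless of how long the show history is.
    page = 1
    while True:
        p = dict(params or {})
        p['page'] = page
        data = fetch_json(endpoint, p)
        if not data:
            return
        yield data
        page += 1
        time.sleep(0.1)  # be polite to the API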
@@ -33,126 +30,113 @@ def fetch_json(endpoint, params=None):
time.sleep(2)
return None
def fetch_all_pages(endpoint, params=None):
"""Fetch all pages from an endpoint"""
if params is None:
params = {}
results = []
page = 1
while True:
print(f" Fetching {endpoint} page {page}...", end="\r", flush=True)
p = params.copy()
p['page'] = page
data = fetch_json(endpoint, p)
if not data:
break
results.extend(data)
page += 1
time.sleep(0.1) # Be nice
print(f"\n Fetched {len(results)} items from {endpoint}")
return results
def main():
print("=" * 60)
print("SMART SETLIST IMPORTER")
print("SMART SETLIST IMPORTER (STREAMING)")
print("=" * 60)
with Session(engine) as session:
# 1. Build DB Map: Date string -> DB Show ID
print("\n1. Building DB Map (Date -> Show ID)...")
shows = session.exec(select(Show)).all()
shows = session.exec(select(Show.id, Show.date)).all() # Only fetch needed fields
date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
print(f" Mapped {len(date_to_db_id)} existing shows in DB")
if not date_to_db_id:
print(" CRITICAL: No shows in database! Run import_shows first.")
print(" CRITICAL: No shows in database!")
return
# 2. Build API Map: ElGoose ID -> Date
print("\n2. Fetching API Shows to build ElGoose ID -> Date map...")
# Only fetch shows for our artist (Goose = 3)
api_shows = fetch_all_pages("shows", {"artist": 3})
if not api_shows:
# Fallback if artist filter fails or returns empty
print(" Artist filter returned empty, fetching all shows...")
api_shows = fetch_all_pages("shows")
del shows
gc.collect()
# 2. Build API Map: ElGoose ID -> DB ID
# Process iteratively to save memory
print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
elgoose_id_to_db_id = {}
matched_count = 0
for s in api_shows:
s_date = s.get('showdate')
s_id = s.get('show_id')
if s_date and s_id:
# Lookup in DB map
db_id = date_to_db_id.get(s_date)
if db_id:
elgoose_id_to_db_id[s_id] = db_id
matched_count += 1
print(f" Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
# 3. Cache Songs for Lookup
print("\n3. Caching Songs...")
songs = session.exec(select(Song)).all()
song_map = {s.title.lower().strip(): s.id for s in songs} # title -> id
print(f" Cached {len(song_map)} songs")
# 4. Fetch and Import Setlists
print("\n4. Fetching Setlists and Importing...")
# Since we can't filter setlists by artist easily without checking every item,
# we'll fetch all and filter by our known show IDs.
page = 1
total_added = 0
total_processed = 0
while True:
start_time = time.time()
data = fetch_json("setlists", {"page": page})
# Fetch batch of shows
print(f" Fetching shows page {page}...", end="\r", flush=True)
data = fetch_json("shows", {"page": page}) # Fetch all shows (artist filter can be flaky)
if not data:
print(" No more data.")
break
for s in data:
# We only need Goose shows; artist_id is usually 3, but matching on date is more reliable than the filter
s_date = s.get('showdate')
s_id = s.get('show_id')
if s_date and s_id:
db_id = date_to_db_id.get(s_date)
if db_id:
elgoose_id_to_db_id[s_id] = db_id
matched_count += 1
page += 1
if page % 10 == 0:
gc.collect()
print(f"\n Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
del date_to_db_id
gc.collect()
# 3. Cache Songs
print("\n3. Caching Songs...")
songs = session.exec(select(Song.id, Song.title)).all()
song_map = {s.title.lower().strip(): s.id for s in songs}
del songs
gc.collect()
print(f" Cached {len(song_map)} songs")
# 4. Process Setlists
print("\n4. Importing Setlists...")
page = 1
total_added = 0
while True:
data = fetch_json("setlists", {"page": page})
if not data:
break
# Duplicate check: a per-row SELECT for every setlist entry would mean thousands
# of extra queries, so cache ALL existing (show_id, song_id, position) keys once.
# Even at ~40k performances that tuple set is only a few MB of RAM, which is safe.
if page == 1:
print(" Caching existing performance keys...")
perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
print(f" Cached {len(existing_keys)} performance keys")
del perfs
gc.collect()
batch_added = 0
new_objects = []
for perf in data:
total_processed += 1
elgoose_show_id = perf.get('show_id')
# Check if this performance belongs to a show we care about
db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
if not db_show_id:
continue # Not a Goose show or show not in our DB
continue
# Resolve Song
song_name = perf.get('songname', '').strip()
song_id = song_map.get(song_name.lower())
if not song_id:
# Song not in the cache; all songs should already be imported,
# so skip this entry rather than create one on the fly.
continue
position = perf.get('position', 0)
# Check duplication
# We can cache existing performances for speed, but SQL check is safer for now
existing = session.exec(
select(Performance).where(
Performance.show_id == db_show_id,
Performance.song_id == song_id,
Performance.position == position
)
).first()
if existing:
# Check uniqueness
if (db_show_id, song_id, position) in existing_keys:
continue
# Map setnumber to a set name, then create the Performance
set_val = str(perf.get('setnumber', '1'))
if set_val.isdigit():
set_name = f"Set {set_val}"
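The duplicate-check cache above is sized by eye (a few MB for roughly 40k keys); the figure is easy to check empirically. A rough, self-contained sketch with synthetic keys; it counts the set's hash table and the key tuples themselves, not the int objects inside them:

import sys

def approx_set_size(keys):
    # Hash table of the set itself plus each key tuple it holds.
    return sys.getsizeof(keys) + sum(sys.getsizeof(t) for t in keys)

keys = {(show_id, song_id, pos)
        for show_id in range(200)
        for song_id in range(20)
        for pos in range(10)}  # 40,000 synthetic (show_id, song_id, position) keys
print(f"{approx_set_size(keys) / 1e6:.1f} MB for {len(keys):,} keys")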
@@ -173,35 +157,24 @@ def main():
set_name=set_name,
segue=bool(perf.get('segue', 0)),
notes=perf.get('footnote'),
slug=f"{generate_slug(song_name)}-{db_show_id}-{position}" # temp slug strategy
slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
)
session.add(new_perf)
new_objects.append(new_perf)
existing_keys.add((db_show_id, song_id, position)) # Add to cache
batch_added += 1
total_added += 1
session.commit()
elapsed = time.time() - start_time
print(f" Page {page}: Processed {len(data)}, Added {batch_added} ({elapsed:.2f}s)")
# Note: pages with no new adds could in principle be skipped, but the setlists
# endpoint appears to be ordered by date, so we still walk the full history.
if new_objects:
session.add_all(new_objects)
session.commit()
print(f" Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
page += 1
if page > 2000: # Safety break
break
# Fix slugs properly
print("\n5. Fixing Slugs...")
# The slug generated above uses the DB show ID, which is unique but not pretty;
# update it to a date-based slug here, or rely on the separate slug fixer.
print("\n" + "=" * 60)
print("IMPORT COMPLETE")
print(f"Total Added: {total_added}")
print("=" * 60)
if page % 20 == 0:
gc.collect()
print(f"\nImport Complete! Total Added: {total_added}")
if __name__ == "__main__":
main()
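The slug written during import ({song-slug}-{db_show_id}-{position}) is unique but, as the step-5 comments above note, not pretty. One possible follow-up pass, not part of this commit, assuming the Show and Performance models used above and one show per date (the same assumption the date map makes); prettify_slugs is a hypothetical name:

def prettify_slugs():
    # One-off pass: rewrite slugs as <song-slug>-<YYYY-MM-DD>-<position>.
    # Loads every performance at once, which is acceptable for a single cleanup run.
    with Session(engine) as session:
        rows = session.exec(
            select(Performance, Show.date).join(Show, Show.id == Performance.show_id)
        ).all()
        for perf, show_date in rows:
            base = perf.slug.rsplit('-', 2)[0]  # drop the "-<show_id>-<position>" suffix
            perf.slug = f"{base}-{show_date.strftime('%Y-%m-%d')}-{perf.position}"
            session.add(perf)
        session.commit()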