elmeg-demo/backend/import_setlists_smart.py


"""
Smart Setlist Importer (Streaming Version)
Reducing memory usage by processing data in streams instead of bulk loading.
"""
import requests
import time
import gc
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Show, Song, Performance
from slugify import generate_slug

BASE_URL = "https://elgoose.net/api/v2"

def fetch_json(endpoint, params=None):
    """Fetch JSON from El Goose API with retries"""
    url = f"{BASE_URL}/{endpoint}.json"
    for attempt in range(3):
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if data.get('error') == 1:
                return None
            return data.get('data', [])
        except Exception as e:
            print(f"  Error fetching {endpoint} (attempt {attempt+1}): {e}")
            time.sleep(2)
    return None

def main():
    print("=" * 60)
    print("SMART SETLIST IMPORTER (STREAMING)")
    print("=" * 60)

    with Session(engine) as session:
        # 1. Build DB Map: Date string -> DB Show ID
        print("\n1. Building DB Map (Date -> Show ID)...")
        shows = session.exec(select(Show.id, Show.date)).all() # Only fetch needed fields
        date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
        print(f"   Mapped {len(date_to_db_id)} existing shows in DB")

        if not date_to_db_id:
            print("   CRITICAL: No shows in database!")
            return

        del shows
        gc.collect()

        # 2. Build API Map: ElGoose ID -> DB ID
        # Process iteratively to save memory
        print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
        elgoose_id_to_db_id = {}
        matched_count = 0

        page = 1
        seen_ids_in_run = set()

        while True:
            # Fetch batch of shows
            print(f"   Fetching shows page {page}...", end="\r", flush=True)
            data = fetch_json("shows", {"page": page}) # Fetch all shows (artist filter can be flaky)
            if not data:
                break

            # Check for API loop (if Page X returns same content as Page 1)
            first_id_in_batch = data[0].get('show_id') if data else None
            if first_id_in_batch and first_id_in_batch in seen_ids_in_run:
                print(f"\n   Loop detected at page {page} (ID {first_id_in_batch} seen before). Breaking.")
                break

            for s in data:
                # We only need Goose shows (artist_id=3 usually, but we check date match)
                s_date = s.get('showdate')
                s_id = s.get('show_id')

                if s_id:
                    seen_ids_in_run.add(s_id)

                if s_date and s_id:
                    db_id = date_to_db_id.get(s_date)
                    if db_id:
                        elgoose_id_to_db_id[s_id] = db_id
                        matched_count += 1

            page += 1
            if page % 10 == 0:
                gc.collect()

        print(f"\n   Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
        del date_to_db_id
        gc.collect()

        # 3. Cache Songs
        print("\n3. Caching Songs...")
        songs = session.exec(select(Song.id, Song.title)).all()
        song_map = {s.title.lower().strip(): s.id for s in songs}
        del songs
        gc.collect()
        print(f"   Cached {len(song_map)} songs")

        # 4. Process Setlists
        print("\n4. Importing Setlists...")
        page = 1
        total_added = 0

        while True:
            data = fetch_json("setlists", {"page": page})
            if not data:
                break

            # Prefetch checks for this batch to avoid N+1 SELECTs?
            # Actually with 3600 perfs, one-by-one check is slow.
            # But "existing check" is needed.
            # We can cache *existing performances* for the CURRENT batch's shows?
            # Or just cache ALL existing performance keys (show_id, song_id, position)?
            # Performance table might be large (40k rows?).
            # (show_id, song_id, position) tuples set is ~2MB RAM. Safe.

            if page == 1:
                print("   Caching existing performance keys...")
                perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
                existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
                print(f"   Cached {len(existing_keys)} performance keys")
                del perfs
                gc.collect()

            batch_added = 0
            new_objects = []

            for perf in data:
                elgoose_show_id = perf.get('show_id')
                db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
                if not db_show_id:
                    continue

                song_name = perf.get('songname', '').strip()
                song_id = song_map.get(song_name.lower())
                if not song_id:
                    continue

                position = perf.get('position', 0)

                # Check uniqueness
                if (db_show_id, song_id, position) in existing_keys:
                    continue

                # Create
                set_val = str(perf.get('setnumber', '1'))
                if set_val.isdigit():
                    set_name = f"Set {set_val}"
                elif set_val.lower() == 'e':
                    set_name = "Encore"
                elif set_val.lower() == 'e2':
                    set_name = "Encore 2"
                elif set_val.lower() == 's':
                    set_name = "Soundcheck"
                else:
                    set_name = f"Set {set_val}"


                new_perf = Performance(
                    show_id=db_show_id,
                    song_id=song_id,
                    position=position,
                    set_name=set_name,
                    segue=bool(perf.get('segue', 0)),
                    notes=perf.get('footnote'),
                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
                )
                new_objects.append(new_perf)
                existing_keys.add((db_show_id, song_id, position)) # Add to cache
                batch_added += 1
                total_added += 1

            if new_objects:
                session.add_all(new_objects)
                session.commit()

            print(f"   Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
            page += 1

            if page % 20 == 0:
                gc.collect()

        print(f"\nImport Complete! Total Added: {total_added}")

if __name__ == "__main__":
    main()