Fix import scripts: proper Goose filtering, loop detection, set name updates

fullsizemalt 2025-12-25 21:49:19 -08:00
parent 29e3e07141
commit 8a46000b9d
4 changed files with 176 additions and 59 deletions

View file

@@ -10,6 +10,15 @@ When deploying changes to elmeg, **ONLY rebuild the backend and frontend contain
 ## Safe deployment command
+### Production (`elmeg.xyz`) - tangible-aacorn
+```bash
+# turbo
+ssh tangible-aacorn "cd /srv/containers/elmeg-demo && git pull && docker compose up -d --build --no-deps backend frontend"
+```
+
+### Staging (`elmeg.runfoo.run`) - nexus-vector
 ```bash
 # turbo
 ssh nexus-vector "cd /srv/containers/elmeg-demo && git pull && docker compose up -d --build --no-deps backend frontend"
@@ -36,3 +45,29 @@ ssh nexus-vector "docker exec elmeg-demo-db-1 pg_dump -U elmeg elmeg > /srv/cont
 ```bash
 ssh nexus-vector "cat /srv/containers/elmeg-demo/backup-YYYYMMDD-HHMMSS.sql | docker exec -i elmeg-demo-db-1 psql -U elmeg elmeg"
 ```
+
+## Data Import (Recovery)
+If the database is wiped or fresh, use the Smart Import script to populate shows and setlists. The script is memory-optimized and detects pagination loops, so an API that keeps serving the same page cannot hang the import.
+
+### Production (tangible-aacorn)
+```bash
+ssh tangible-aacorn "docker exec elmeg-backend-1 python import_setlists_smart.py"
+```
+
+### Staging (nexus-vector)
+```bash
+ssh nexus-vector "docker exec elmeg-demo-backend-1 python import_setlists_smart.py"
+```
+
+## Git Configuration (Production)
+To ensure `git pull` works correctly on production:
+```bash
+# On nexus-vector
+cd /srv/containers/elmeg-demo
+git branch --set-upstream-to=origin/main main
+```
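
After a recovery import, it is worth sanity-checking row counts directly in Postgres before trusting the site. A minimal check, assuming SQLModel's default lowercase table name for `Performance` (substitute the production host and db container name as appropriate):

```bash
# Staging example; adjust host/container for production
ssh nexus-vector "docker exec elmeg-demo-db-1 psql -U elmeg elmeg -c 'SELECT COUNT(*) FROM performance;'"
```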

View file

@@ -37,13 +37,12 @@ def main():
         print(f"Mapped {len(song_map)} songs")
 
         # Get existing performances
-        existing = set()
-        perfs = session.exec(
-            select(Performance.show_id, Performance.song_id, Performance.position)
-        ).all()
+        print("Loading existing performances...")
+        existing_map = {}  # (show_id, song_id, position) -> Performance object
+        perfs = session.exec(select(Performance)).all()
         for p in perfs:
-            existing.add((p[0], p[1], p[2]))
-        print(f"Found {len(existing)} existing performances")
+            existing_map[(p.show_id, p.song_id, p.position)] = p
+        print(f"Found {len(existing_map)} existing performances")
 
         # We need API show IDs. The ElGoose API shows endpoint returns show_id.
         # Let's fetch and correlate by date
@@ -51,26 +50,39 @@ def main():
         api_shows = {}  # date_str -> api_show_id
         page = 1
+        seen_ids = set()
         while True:
             url = f"{BASE_URL}/shows.json"
             try:
-                resp = requests.get(url, params={"artist": 1, "page": page}, timeout=30)
+                resp = requests.get(url, params={"page": page}, timeout=30)
                 data = resp.json().get('data', [])
                 if not data:
                     break
+
+                # Loop detection
+                first_id = data[0].get('show_id') if data else None
+                if first_id in seen_ids:
+                    print(f"  Loop detected at page {page}")
+                    break
+                if first_id:
+                    seen_ids.add(first_id)
+
                 for s in data:
+                    # CRITICAL: Only include Goose shows
+                    if s.get('artist') != 'Goose':
+                        continue
                     date_str = s['showdate']
                     api_shows[date_str] = s['show_id']
                 page += 1
                 if page > 50:
                     break
-            except:
+            except Exception as e:
+                print(f"  Error on page {page}: {e}")
                 break
 
         print(f"Got {len(api_shows)} API show IDs")
 
         # Now import setlists for each show
         total_added = 0
+        total_updated = 0
         processed = 0
 
         for show in shows:
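
Distilled from the hunk above, the fix is three small guards working together: a client-side artist filter (the `/shows.json` endpoint returns every artist), first-ID loop detection, and a hard page cap. A standalone sketch under those assumptions — the `BASE_URL` value and exact field names are taken on faith from the script:

```python
import requests

BASE_URL = "https://elgoose.net/api/v2"  # assumed; the script's own BASE_URL applies

def fetch_goose_shows(max_pages=50):
    """Paginate /shows.json, keeping only Goose shows, with loop detection."""
    api_shows = {}          # showdate -> show_id
    seen_first_ids = set()  # first ID of each page; a repeat means the API is looping
    page = 1
    while page <= max_pages:
        try:
            resp = requests.get(f"{BASE_URL}/shows.json", params={"page": page}, timeout=30)
            data = resp.json().get('data', [])
        except Exception as e:
            print(f"Error on page {page}: {e}")
            break
        if not data:
            break  # normal end of pagination
        first_id = data[0].get('show_id')
        if first_id and first_id in seen_first_ids:
            break  # same page served twice: stop instead of spinning forever
        if first_id:
            seen_first_ids.add(first_id)
        for s in data:
            if s.get('artist') != 'Goose':
                continue  # the endpoint mixes artists; filter client-side
            api_shows[s['showdate']] = s['show_id']
        page += 1
    return api_shows
```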
@@ -80,13 +92,8 @@ def main():
             if not api_show_id:
                 continue
 
-            # Check if we already have performances for this show
-            existing_for_show = session.exec(
-                select(Performance).where(Performance.show_id == show.id)
-            ).first()
-            if existing_for_show:
-                continue  # Skip shows that already have performances
+            # REMOVED: Skipping logic. We verify everything.
+            # existing_for_show = ...
 
             # Fetch setlist
             setlist = fetch_show_setlist(api_show_id)
@@ -94,6 +101,8 @@ def main():
                 continue
 
             added = 0
+            updated = 0
+
             for item in setlist:
                 song_title = item.get('songname', '').lower()
                 song_id = song_map.get(song_title)
@@ -104,28 +113,49 @@ def main():
                 position = item.get('position', 0)
                 key = (show.id, song_id, position)
 
-                if key in existing:
+                # Resolve set name
+                set_val = str(item.get('setnumber', '1'))
+                if set_val.isdigit():
+                    set_name = f"Set {set_val}"
+                elif set_val.lower() == 'e':
+                    set_name = "Encore"
+                elif set_val.lower() == 'e2':
+                    set_name = "Encore 2"
+                elif set_val.lower() == 's':
+                    set_name = "Soundcheck"
+                else:
+                    set_name = f"Set {set_val}"
+
+                if key in existing_map:
+                    # Update Check
+                    perf = existing_map[key]
+                    if not perf.set_name or perf.set_name != set_name:
+                        perf.set_name = set_name
+                        session.add(perf)
+                        updated += 1
+                        total_updated += 1
                     continue
 
+                # Create New
                 perf = Performance(
                     show_id=show.id,
                     song_id=song_id,
                     position=position,
-                    set_name=item.get('set'),
+                    set_name=set_name,
                     segue=bool(item.get('segue', 0)),
                     notes=item.get('footnote')
                 )
                 session.add(perf)
-                existing.add(key)
+                existing_map[key] = perf  # Add to map to prevent dupes in same run
                 added += 1
                 total_added += 1
 
-            if added > 0:
+            if added > 0 or updated > 0:
                 session.commit()
             processed += 1
-            print(f"Show {date_str}: +{added} songs ({total_added} total)")
-    print(f"\n✓ Added {total_added} performances from {processed} shows")
+            print(f"Show {date_str}: +{added} new, ~{updated} updated")
+
+    print(f"\nImport Complete! Added: {total_added}, Updated: {total_updated}")
 
 if __name__ == "__main__":
     main()
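
The `setnumber` → `set_name` mapping now appears in both import scripts. It is small enough to live in one shared helper; a sketch, with codes and labels exactly as in the diff:

```python
def resolve_set_name(setnumber) -> str:
    """Map ElGoose setnumber codes ('1', '2', 'e', 'e2', 's') to display names."""
    set_val = str(setnumber or '1').lower()
    if set_val.isdigit():
        return f"Set {set_val}"
    return {
        'e': "Encore",
        'e2': "Encore 2",
        's': "Soundcheck",
    }.get(set_val, f"Set {set_val}")

# e.g. resolve_set_name('2') -> 'Set 2', resolve_set_name('e2') -> 'Encore 2'
```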

View file

@@ -38,7 +38,7 @@ def main():
     with Session(engine) as session:
         # 1. Build DB Map: Date string -> DB Show ID
         print("\n1. Building DB Map (Date -> Show ID)...")
-        shows = session.exec(select(Show.id, Show.date)).all()  # Only fetch needed fields
+        shows = session.exec(select(Show.id, Show.date)).all()
         date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
         print(f"  Mapped {len(date_to_db_id)} existing shows in DB")
@@ -50,40 +50,34 @@ def main():
         gc.collect()
 
         # 2. Build API Map: ElGoose ID -> DB ID
-        # Process iteratively to save memory
         print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
         elgoose_id_to_db_id = {}
         matched_count = 0
         page = 1
-        seen_ids_in_run = set()
+        seen_show_ids = set()
 
         while True:
-            # Fetch batch of shows
             print(f"  Fetching shows page {page}...", end="\r", flush=True)
-            data = fetch_json("shows", {"page": page})  # Fetch all shows (artist filter can be flaky)
+            data = fetch_json("shows", {"page": page})
             if not data:
                 break
 
-            # Check for API loop (if Page X returns same content as Page 1)
-            first_id_in_batch = data[0].get('show_id') if data else None
-            if first_id_in_batch and first_id_in_batch in seen_ids_in_run:
-                print(f"\n  Loop detected at page {page} (ID {first_id_in_batch} seen before). Breaking.")
+            # Loop Detection (Shows)
+            first_id = data[0].get('show_id') if data else None
+            if first_id and first_id in seen_show_ids:
+                print(f"\n  Loop detected in Shows at page {page} (ID {first_id}). Breaking.")
                 break
+            if first_id:
+                seen_show_ids.add(first_id)
 
             for s in data:
-                # We only need Goose shows (artist_id=3 usually, but we check date match)
                 s_date = s.get('showdate')
                 s_id = s.get('show_id')
-                if s_id:
-                    seen_ids_in_run.add(s_id)
                 if s_date and s_id:
                     db_id = date_to_db_id.get(s_date)
                     if db_id:
                         elgoose_id_to_db_id[s_id] = db_id
                         matched_count += 1
 
             page += 1
             if page % 10 == 0:
@@ -93,7 +87,7 @@ def main():
         del date_to_db_id
         gc.collect()
 
-        # 3. Cache Songs
+        # 3. Caching Songs
         print("\n3. Caching Songs...")
         songs = session.exec(select(Song.id, Song.title)).all()
         song_map = {s.title.lower().strip(): s.id for s in songs}
@@ -101,32 +95,35 @@ def main():
         gc.collect()
         print(f"  Cached {len(song_map)} songs")
 
-        # 4. Process Setlists
+        # 4. Importing Setlists
         print("\n4. Importing Setlists...")
         page = 1
         total_added = 0
+        seen_batch_signatures = set()
+
+        # Cache existing performance keys (show_id, song_id, position)
+        print("  Caching existing performance keys...")
+        perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
+        existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
+        print(f"  Cached {len(existing_keys)} existing performances")
+        del perfs
+        gc.collect()
 
         while True:
             data = fetch_json("setlists", {"page": page})
             if not data:
                 break
 
-            # Prefetch checks for this batch to avoid N+1 SELECTs?
-            # Actually with 3600 perfs, one-by-one check is slow.
-            # But "existing check" is needed.
-            # We can cache *existing performances* for the CURRENT batch's shows?
-            # Or just cache ALL existing performance keys (show_id, song_id, position)?
-            # Performance table might be large (40k rows?).
-            # (show_id, song_id, position) tuples set is ~2MB RAM. Safe.
+            # Loop Detection (Setlists)
+            # Use signature of first item: (uniqueid or show_id+position)
+            if data:
+                first = data[0]
+                signature = f"{first.get('uniqueid')}-{first.get('show_id')}-{first.get('position')}"
+                if signature in seen_batch_signatures:
+                    print(f"\n  Loop detected in Setlists at page {page} (Sig {signature}). Breaking.")
+                    break
+                seen_batch_signatures.add(signature)
 
-            if page == 1:
-                print("  Caching existing performance keys...")
-                perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
-                existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
-                print(f"  Cached {len(existing_keys)} performance keys")
-                del perfs
-                gc.collect()
 
             batch_added = 0
             new_objects = []
@@ -143,11 +140,9 @@ def main():
                 position = perf.get('position', 0)
 
                 # Check uniqueness
                 if (db_show_id, song_id, position) in existing_keys:
                     continue
 
-                # Create
                 set_val = str(perf.get('setnumber', '1'))
                 if set_val.isdigit():
                     set_name = f"Set {set_val}"
@@ -171,7 +166,7 @@ def main():
                     slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
                 )
                 new_objects.append(new_perf)
-                existing_keys.add((db_show_id, song_id, position))  # Add to cache
+                existing_keys.add((db_show_id, song_id, position))
                 batch_added += 1
                 total_added += 1
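
The duplicate check here is the usual preload-then-test pattern: one SELECT for every (show_id, song_id, position) tuple, then an O(1) set-membership test per setlist row instead of one query each. A sketch of the two halves, assuming the `models` module used elsewhere in this repo:

```python
from sqlmodel import select
from models import Performance  # model name as used in the scripts above

def load_existing_keys(session):
    """One query up front: every (show_id, song_id, position) already stored."""
    rows = session.exec(
        select(Performance.show_id, Performance.song_id, Performance.position)
    ).all()
    return {(r.show_id, r.song_id, r.position) for r in rows}

def claim_key(key, existing_keys):
    """True if key is new; claims it immediately so dupes within one run are skipped too."""
    if key in existing_keys:
        return False
    existing_keys.add(key)
    return True
```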

View file

@@ -0,0 +1,57 @@
+import pytest
+from sqlmodel import Session, SQLModel, create_engine
+
+from models import User, Review, Show, Rating
+from schemas import ReviewCreate
+from services.gamification import award_xp
+from routers.reviews import create_review
+from fastapi import HTTPException
+
+# Mock auth
+def mock_get_current_user():
+    return User(id=1, email="test@test.com", hashed_password="pw", is_active=True)
+
+# Setup a throwaway file-based SQLite DB (kept on disk so it can be inspected after a crash)
+sqlite_file_name = "test_review_debug.db"
+sqlite_url = f"sqlite:///{sqlite_file_name}"
+engine = create_engine(sqlite_url)
+
+def test_repro_review_crash():
+    SQLModel.metadata.create_all(engine)
+
+    with Session(engine) as session:
+        # Create dummy user and show
+        user = User(email="test@test.com", hashed_password="pw")
+        session.add(user)
+
+        show = Show(date="2025-01-01", slug="test-show")
+        session.add(show)
+        session.commit()
+        session.refresh(user)
+        session.refresh(show)
+
+        print(f"User ID: {user.id}, Show ID: {show.id}")
+
+        # Payload
+        review_payload = ReviewCreate(
+            show_id=show.id,
+            content="Test Review Content",
+            blurb="Test Blurb",
+            score=5.0
+        )
+
+        try:
+            print("Attempting to create review...")
+            result = create_review(
+                review=review_payload,
+                session=session,
+                current_user=user
+            )
+            print("Review created successfully:", result)
+        except Exception as e:
+            print(f"\nCRASH DETECTED: {e}")
+            import traceback
+            traceback.print_exc()
+
+if __name__ == "__main__":
+    test_repro_review_crash()
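
The repro can be run directly (it has a `__main__` guard) or through pytest; `-s` disables output capture so the debug prints and traceback stay visible. The path assumes the file lands in the backend's working directory, like the import scripts:

```bash
ssh nexus-vector "docker exec elmeg-demo-backend-1 python test_review_debug.py"
# or, inside the container:
pytest test_review_debug.py -s
```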