Fix import scripts: proper Goose filtering, loop detection, set name updates

fullsizemalt 2025-12-25 21:49:19 -08:00
parent 29e3e07141
commit 8a46000b9d
4 changed files with 176 additions and 59 deletions

View file

@@ -10,6 +10,15 @@ When deploying changes to elmeg, **ONLY rebuild the backend and frontend contain
 ## Safe deployment command
+### Production (`elmeg.xyz`) - tangible-aacorn
+```bash
+# turbo
+ssh tangible-aacorn "cd /srv/containers/elmeg-demo && git pull && docker compose up -d --build --no-deps backend frontend"
+```
+
+### Staging (`elmeg.runfoo.run`) - nexus-vector
 ```bash
 # turbo
 ssh nexus-vector "cd /srv/containers/elmeg-demo && git pull && docker compose up -d --build --no-deps backend frontend"
@@ -36,3 +45,29 @@ ssh nexus-vector "docker exec elmeg-demo-db-1 pg_dump -U elmeg elmeg > /srv/cont
 ```bash
 ssh nexus-vector "cat /srv/containers/elmeg-demo/backup-YYYYMMDD-HHMMSS.sql | docker exec -i elmeg-demo-db-1 psql -U elmeg elmeg"
 ```
+
+## Data Import (Recovery)
+If the database is wiped or fresh, use the Smart Import script to populate shows and setlists. The script is memory-optimized and detects pagination loops, so an API that keeps serving the same page cannot hang the import.
+
+### Production (tangible-aacorn)
+```bash
+ssh tangible-aacorn "docker exec elmeg-backend-1 python import_setlists_smart.py"
+```
+
+### Staging (nexus-vector)
+```bash
+ssh nexus-vector "docker exec elmeg-demo-backend-1 python import_setlists_smart.py"
+```
+
+## Git Configuration (Production)
+To ensure `git pull` works correctly on production:
+```bash
+# On nexus-vector
+cd /srv/containers/elmeg-demo
+git branch --set-upstream-to=origin/main main
+```
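
After a recovery import, it is worth sanity-checking row counts directly in Postgres before trusting the site. A minimal check, assuming SQLModel's default lowercase table name for `Performance` (substitute the production host and db container name as appropriate):

```bash
# Staging example; adjust host/container for production
ssh nexus-vector "docker exec elmeg-demo-db-1 psql -U elmeg elmeg -c 'SELECT COUNT(*) FROM performance;'"
```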

View file

@@ -37,13 +37,12 @@ def main():
         print(f"Mapped {len(song_map)} songs")
 
         # Get existing performances
-        existing = set()
-        perfs = session.exec(
-            select(Performance.show_id, Performance.song_id, Performance.position)
-        ).all()
+        print("Loading existing performances...")
+        existing_map = {}  # (show_id, song_id, position) -> Performance object
+        perfs = session.exec(select(Performance)).all()
         for p in perfs:
-            existing.add((p[0], p[1], p[2]))
-        print(f"Found {len(existing)} existing performances")
+            existing_map[(p.show_id, p.song_id, p.position)] = p
+        print(f"Found {len(existing_map)} existing performances")
 
         # We need API show IDs. The ElGoose API shows endpoint returns show_id.
         # Let's fetch and correlate by date
@@ -51,26 +50,39 @@ def main():
         api_shows = {}  # date_str -> api_show_id
         page = 1
+        seen_ids = set()
         while True:
             url = f"{BASE_URL}/shows.json"
             try:
-                resp = requests.get(url, params={"artist": 1, "page": page}, timeout=30)
+                resp = requests.get(url, params={"page": page}, timeout=30)
                 data = resp.json().get('data', [])
                 if not data:
                     break
+
+                # Loop detection
+                first_id = data[0].get('show_id') if data else None
+                if first_id in seen_ids:
+                    print(f"  Loop detected at page {page}")
+                    break
+                if first_id:
+                    seen_ids.add(first_id)
+
                 for s in data:
+                    # CRITICAL: Only include Goose shows
+                    if s.get('artist') != 'Goose':
+                        continue
                     date_str = s['showdate']
                     api_shows[date_str] = s['show_id']
                 page += 1
                 if page > 50:
                     break
-            except:
+            except Exception as e:
+                print(f"  Error on page {page}: {e}")
                 break
 
         print(f"Got {len(api_shows)} API show IDs")
 
         # Now import setlists for each show
         total_added = 0
+        total_updated = 0
         processed = 0
 
         for show in shows:
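
Distilled from the hunk above, the fix is three small guards working together: a client-side artist filter (the `/shows.json` endpoint returns every artist), first-ID loop detection, and a hard page cap. A standalone sketch under those assumptions — the `BASE_URL` value and exact field names are taken on faith from the script:

```python
import requests

BASE_URL = "https://elgoose.net/api/v2"  # assumed; the script's own BASE_URL applies

def fetch_goose_shows(max_pages=50):
    """Paginate /shows.json, keeping only Goose shows, with loop detection."""
    api_shows = {}          # showdate -> show_id
    seen_first_ids = set()  # first ID of each page; a repeat means the API is looping
    page = 1
    while page <= max_pages:
        try:
            resp = requests.get(f"{BASE_URL}/shows.json", params={"page": page}, timeout=30)
            data = resp.json().get('data', [])
        except Exception as e:
            print(f"Error on page {page}: {e}")
            break
        if not data:
            break  # normal end of pagination
        first_id = data[0].get('show_id')
        if first_id and first_id in seen_first_ids:
            break  # same page served twice: stop instead of spinning forever
        if first_id:
            seen_first_ids.add(first_id)
        for s in data:
            if s.get('artist') != 'Goose':
                continue  # the endpoint mixes artists; filter client-side
            api_shows[s['showdate']] = s['show_id']
        page += 1
    return api_shows
```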
@@ -80,13 +92,8 @@ def main():
             if not api_show_id:
                 continue
 
-            # Check if we already have performances for this show
-            existing_for_show = session.exec(
-                select(Performance).where(Performance.show_id == show.id)
-            ).first()
-            if existing_for_show:
-                continue  # Skip shows that already have performances
+            # REMOVED: Skipping logic. We verify everything.
+            # existing_for_show = ...
 
             # Fetch setlist
             setlist = fetch_show_setlist(api_show_id)
@@ -94,6 +101,8 @@ def main():
                 continue
 
             added = 0
+            updated = 0
+
             for item in setlist:
                 song_title = item.get('songname', '').lower()
                 song_id = song_map.get(song_title)
@@ -104,28 +113,49 @@ def main():
                 position = item.get('position', 0)
                 key = (show.id, song_id, position)
 
-                if key in existing:
+                # Resolve set name
+                set_val = str(item.get('setnumber', '1'))
+                if set_val.isdigit():
+                    set_name = f"Set {set_val}"
+                elif set_val.lower() == 'e':
+                    set_name = "Encore"
+                elif set_val.lower() == 'e2':
+                    set_name = "Encore 2"
+                elif set_val.lower() == 's':
+                    set_name = "Soundcheck"
+                else:
+                    set_name = f"Set {set_val}"
+
+                if key in existing_map:
+                    # Update Check
+                    perf = existing_map[key]
+                    if not perf.set_name or perf.set_name != set_name:
+                        perf.set_name = set_name
+                        session.add(perf)
+                        updated += 1
+                        total_updated += 1
                     continue
 
+                # Create New
                 perf = Performance(
                     show_id=show.id,
                     song_id=song_id,
                     position=position,
-                    set_name=item.get('set'),
+                    set_name=set_name,
                     segue=bool(item.get('segue', 0)),
                     notes=item.get('footnote')
                 )
                 session.add(perf)
-                existing.add(key)
+                existing_map[key] = perf  # Add to map to prevent dupes in same run
                 added += 1
                 total_added += 1
 
-            if added > 0:
+            if added > 0 or updated > 0:
                 session.commit()
             processed += 1
-            print(f"Show {date_str}: +{added} songs ({total_added} total)")
-    print(f"\n✓ Added {total_added} performances from {processed} shows")
+            print(f"Show {date_str}: +{added} new, ~{updated} updated")
+
+    print(f"\nImport Complete! Added: {total_added}, Updated: {total_updated}")
 
 if __name__ == "__main__":
     main()
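
The `setnumber` → `set_name` mapping now appears in both import scripts. It is small enough to live in one shared helper; a sketch, with codes and labels exactly as in the diff:

```python
def resolve_set_name(setnumber) -> str:
    """Map ElGoose setnumber codes ('1', '2', 'e', 'e2', 's') to display names."""
    set_val = str(setnumber or '1').lower()
    if set_val.isdigit():
        return f"Set {set_val}"
    return {
        'e': "Encore",
        'e2': "Encore 2",
        's': "Soundcheck",
    }.get(set_val, f"Set {set_val}")

# e.g. resolve_set_name('2') -> 'Set 2', resolve_set_name('e2') -> 'Encore 2'
```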

View file

@@ -38,7 +38,7 @@ def main():
     with Session(engine) as session:
         # 1. Build DB Map: Date string -> DB Show ID
         print("\n1. Building DB Map (Date -> Show ID)...")
-        shows = session.exec(select(Show.id, Show.date)).all()  # Only fetch needed fields
+        shows = session.exec(select(Show.id, Show.date)).all()
         date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
         print(f"  Mapped {len(date_to_db_id)} existing shows in DB")
@@ -50,40 +50,34 @@ def main():
         gc.collect()
 
         # 2. Build API Map: ElGoose ID -> DB ID
-        # Process iteratively to save memory
         print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
         elgoose_id_to_db_id = {}
         matched_count = 0
         page = 1
-        seen_ids_in_run = set()
+        seen_show_ids = set()
 
         while True:
-            # Fetch batch of shows
             print(f"  Fetching shows page {page}...", end="\r", flush=True)
-            data = fetch_json("shows", {"page": page})  # Fetch all shows (artist filter can be flaky)
+            data = fetch_json("shows", {"page": page})
             if not data:
                 break
 
-            # Check for API loop (if Page X returns same content as Page 1)
-            first_id_in_batch = data[0].get('show_id') if data else None
-            if first_id_in_batch and first_id_in_batch in seen_ids_in_run:
-                print(f"\n  Loop detected at page {page} (ID {first_id_in_batch} seen before). Breaking.")
+            # Loop Detection (Shows)
+            first_id = data[0].get('show_id') if data else None
+            if first_id and first_id in seen_show_ids:
+                print(f"\n  Loop detected in Shows at page {page} (ID {first_id}). Breaking.")
                 break
+            if first_id:
+                seen_show_ids.add(first_id)
 
             for s in data:
-                # We only need Goose shows (artist_id=3 usually, but we check date match)
                 s_date = s.get('showdate')
                 s_id = s.get('show_id')
-                if s_id:
-                    seen_ids_in_run.add(s_id)
                 if s_date and s_id:
                     db_id = date_to_db_id.get(s_date)
                     if db_id:
                         elgoose_id_to_db_id[s_id] = db_id
                         matched_count += 1
 
             page += 1
             if page % 10 == 0:
@@ -93,7 +87,7 @@ def main():
         del date_to_db_id
         gc.collect()
 
-        # 3. Cache Songs
+        # 3. Caching Songs
         print("\n3. Caching Songs...")
         songs = session.exec(select(Song.id, Song.title)).all()
         song_map = {s.title.lower().strip(): s.id for s in songs}
@@ -101,32 +95,35 @@ def main():
         gc.collect()
         print(f"  Cached {len(song_map)} songs")
 
-        # 4. Process Setlists
+        # 4. Importing Setlists
         print("\n4. Importing Setlists...")
         page = 1
         total_added = 0
+        seen_batch_signatures = set()
+
+        # Cache existing performance keys (show_id, song_id, position)
+        print("  Caching existing performance keys...")
+        perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
+        existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
+        print(f"  Cached {len(existing_keys)} existing performances")
+        del perfs
+        gc.collect()
 
         while True:
             data = fetch_json("setlists", {"page": page})
             if not data:
                 break
 
-            # Prefetch checks for this batch to avoid N+1 SELECTs?
-            # Actually with 3600 perfs, one-by-one check is slow.
-            # But "existing check" is needed.
-            # We can cache *existing performances* for the CURRENT batch's shows?
-            # Or just cache ALL existing performance keys (show_id, song_id, position)?
-            # Performance table might be large (40k rows?).
-            # (show_id, song_id, position) tuples set is ~2MB RAM. Safe.
+            # Loop Detection (Setlists)
+            # Use signature of first item: (uniqueid or show_id+position)
+            if data:
+                first = data[0]
+                signature = f"{first.get('uniqueid')}-{first.get('show_id')}-{first.get('position')}"
+                if signature in seen_batch_signatures:
+                    print(f"\n  Loop detected in Setlists at page {page} (Sig {signature}). Breaking.")
+                    break
+                seen_batch_signatures.add(signature)
 
-            if page == 1:
-                print("  Caching existing performance keys...")
-                perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
-                existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
-                print(f"  Cached {len(existing_keys)} performance keys")
-                del perfs
-                gc.collect()
 
             batch_added = 0
             new_objects = []
@@ -143,11 +140,9 @@ def main():
                 position = perf.get('position', 0)
 
                 # Check uniqueness
                 if (db_show_id, song_id, position) in existing_keys:
                     continue
 
-                # Create
                 set_val = str(perf.get('setnumber', '1'))
                 if set_val.isdigit():
                     set_name = f"Set {set_val}"
@@ -171,7 +166,7 @@ def main():
                     slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
                 )
                 new_objects.append(new_perf)
-                existing_keys.add((db_show_id, song_id, position))  # Add to cache
+                existing_keys.add((db_show_id, song_id, position))
                 batch_added += 1
                 total_added += 1
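
The duplicate check here is the usual preload-then-test pattern: one SELECT for every (show_id, song_id, position) tuple, then an O(1) set-membership test per setlist row instead of one query each. A sketch of the two halves, assuming the `models` module used elsewhere in this repo:

```python
from sqlmodel import select
from models import Performance  # model name as used in the scripts above

def load_existing_keys(session):
    """One query up front: every (show_id, song_id, position) already stored."""
    rows = session.exec(
        select(Performance.show_id, Performance.song_id, Performance.position)
    ).all()
    return {(r.show_id, r.song_id, r.position) for r in rows}

def claim_key(key, existing_keys):
    """True if key is new; claims it immediately so dupes within one run are skipped too."""
    if key in existing_keys:
        return False
    existing_keys.add(key)
    return True
```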

View file

@@ -0,0 +1,57 @@
+import pytest
+from sqlmodel import Session, SQLModel, create_engine
+
+from models import User, Review, Show, Rating
+from schemas import ReviewCreate
+from services.gamification import award_xp
+from routers.reviews import create_review
+from fastapi import HTTPException
+
+# Mock auth
+def mock_get_current_user():
+    return User(id=1, email="test@test.com", hashed_password="pw", is_active=True)
+
+# Setup a throwaway file-based SQLite DB (kept on disk so it can be inspected after a crash)
+sqlite_file_name = "test_review_debug.db"
+sqlite_url = f"sqlite:///{sqlite_file_name}"
+engine = create_engine(sqlite_url)
+
+def test_repro_review_crash():
+    SQLModel.metadata.create_all(engine)
+
+    with Session(engine) as session:
+        # Create dummy user and show
+        user = User(email="test@test.com", hashed_password="pw")
+        session.add(user)
+
+        show = Show(date="2025-01-01", slug="test-show")
+        session.add(show)
+        session.commit()
+        session.refresh(user)
+        session.refresh(show)
+
+        print(f"User ID: {user.id}, Show ID: {show.id}")
+
+        # Payload
+        review_payload = ReviewCreate(
+            show_id=show.id,
+            content="Test Review Content",
+            blurb="Test Blurb",
+            score=5.0
+        )
+
+        try:
+            print("Attempting to create review...")
+            result = create_review(
+                review=review_payload,
+                session=session,
+                current_user=user
+            )
+            print("Review created successfully:", result)
+        except Exception as e:
+            print(f"\nCRASH DETECTED: {e}")
+            import traceback
+            traceback.print_exc()
+
+if __name__ == "__main__":
+    test_repro_review_crash()
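
The repro can be run directly (it has a `__main__` guard) or through pytest; `-s` disables output capture so the debug prints and traceback stay visible. The path assumes the file lands in the backend's working directory, like the import scripts:

```bash
ssh nexus-vector "docker exec elmeg-demo-backend-1 python test_review_debug.py"
# or, inside the container:
pytest test_review_debug.py -s
```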