Optimize setlist import for memory

fullsizemalt 2025-12-25 11:05:54 -08:00
parent e2c77d7593
commit 14f016977a


@@ -1,16 +1,13 @@
"""
Smart Setlist Importer
Uses a 2-step mapping strategy to bypass missing dates in setlist endpoint:
1. Fetch ALL shows from API -> Map ElGoose_ID to Date.
2. Fetch ALL DB shows -> Map Date to DB_ID.
3. Combine: ElGoose_ID -> DB_ID.
4. Import setlists using ElGoose_ID from setlist entries.
Smart Setlist Importer (Streaming Version)
Reduces memory usage by processing data in streams instead of bulk-loading it.
"""
import requests
import time
import gc
from datetime import datetime
from sqlmodel import Session, select, func
from sqlmodel import Session, select
from database import engine
from models import Show, Song, Performance
from slugify import generate_slug
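The docstring's "streaming" claim is the heart of this change: instead of the old fetch_all_pages helper (next hunk) accumulating every page of API results in one list, pages are now fetched and discarded one at a time. A minimal sketch of the same pagination as a generator, assuming the script's fetch_json helper (iter_pages is a hypothetical name, not part of the commit):

def iter_pages(endpoint, params=None):
    # Yield one page of results at a time so peak memory stays at roughly
    # one page of JSON, regardless of how long the show history is.
    page = 1
    while True:
        p = dict(params or {})
        p['page'] = page
        data = fetch_json(endpoint, p)
        if not data:
            return
        yield data
        page += 1
        time.sleep(0.1)  # be polite to the API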
@@ -33,126 +30,113 @@ def fetch_json(endpoint, params=None):
time.sleep(2)
return None
def fetch_all_pages(endpoint, params=None):
"""Fetch all pages from an endpoint"""
if params is None:
params = {}
results = []
page = 1
while True:
print(f" Fetching {endpoint} page {page}...", end="\r", flush=True)
p = params.copy()
p['page'] = page
data = fetch_json(endpoint, p)
if not data:
break
results.extend(data)
page += 1
time.sleep(0.1) # Be nice
print(f"\n Fetched {len(results)} items from {endpoint}")
return results
def main():
print("=" * 60)
print("SMART SETLIST IMPORTER")
print("SMART SETLIST IMPORTER (STREAMING)")
print("=" * 60)
with Session(engine) as session:
# 1. Build DB Map: Date string -> DB Show ID
print("\n1. Building DB Map (Date -> Show ID)...")
shows = session.exec(select(Show)).all()
shows = session.exec(select(Show.id, Show.date)).all() # Only fetch needed fields
date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
print(f" Mapped {len(date_to_db_id)} existing shows in DB")
if not date_to_db_id:
print(" CRITICAL: No shows in database! Run import_shows first.")
print(" CRITICAL: No shows in database!")
return
# 2. Build API Map: ElGoose ID -> Date
print("\n2. Fetching API Shows to build ElGoose ID -> Date map...")
# Only fetch shows for our artist (Goose = 3)
api_shows = fetch_all_pages("shows", {"artist": 3})
if not api_shows:
# Fallback if artist filter fails or returns empty
print(" Artist filter returned empty, fetching all shows...")
api_shows = fetch_all_pages("shows")
del shows
gc.collect()
# 2. Build API Map: ElGoose ID -> DB ID
# Process iteratively to save memory
print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
elgoose_id_to_db_id = {}
matched_count = 0
for s in api_shows:
s_date = s.get('showdate')
s_id = s.get('show_id')
if s_date and s_id:
# Lookup in DB map
db_id = date_to_db_id.get(s_date)
if db_id:
elgoose_id_to_db_id[s_id] = db_id
matched_count += 1
print(f" Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
# 3. Cache Songs for Lookup
print("\n3. Caching Songs...")
songs = session.exec(select(Song)).all()
song_map = {s.title.lower().strip(): s.id for s in songs} # title -> id
print(f" Cached {len(song_map)} songs")
# 4. Fetch and Import Setlists
print("\n4. Fetching Setlists and Importing...")
# Since we can't filter setlists by artist easily without checking every item,
# we'll fetch all and filter by our known show IDs.
page = 1
total_added = 0
total_processed = 0
while True:
start_time = time.time()
data = fetch_json("setlists", {"page": page})
# Fetch batch of shows
print(f" Fetching shows page {page}...", end="\r", flush=True)
data = fetch_json("shows", {"page": page}) # Fetch all shows (artist filter can be flaky)
if not data:
print(" No more data.")
break
for s in data:
# We only need Goose shows; artist_id is usually 3, but matching on date is more reliable than the filter
s_date = s.get('showdate')
s_id = s.get('show_id')
if s_date and s_id:
db_id = date_to_db_id.get(s_date)
if db_id:
elgoose_id_to_db_id[s_id] = db_id
matched_count += 1
page += 1
if page % 10 == 0:
gc.collect()
print(f"\n Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
del date_to_db_id
gc.collect()
# 3. Cache Songs
print("\n3. Caching Songs...")
songs = session.exec(select(Song.id, Song.title)).all()
song_map = {s.title.lower().strip(): s.id for s in songs}
del songs
gc.collect()
print(f" Cached {len(song_map)} songs")
# 4. Process Setlists
print("\n4. Importing Setlists...")
page = 1
total_added = 0
while True:
data = fetch_json("setlists", {"page": page})
if not data:
break
# Duplicate check: a per-row SELECT for every setlist entry would mean thousands
# of extra queries, so cache ALL existing (show_id, song_id, position) keys once.
# Even at ~40k performances that tuple set is only a few MB of RAM, which is safe.
if page == 1:
print(" Caching existing performance keys...")
perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
print(f" Cached {len(existing_keys)} performance keys")
del perfs
gc.collect()
batch_added = 0
new_objects = []
for perf in data:
total_processed += 1
elgoose_show_id = perf.get('show_id')
# Check if this performance belongs to a show we care about
db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
if not db_show_id:
continue # Not a Goose show or show not in our DB
continue
# Resolve Song
song_name = perf.get('songname', '').strip()
song_id = song_map.get(song_name.lower())
if not song_id:
# Song not in the cache; all songs should already be imported,
# so skip this entry rather than create one on the fly.
continue
position = perf.get('position', 0)
# Check duplication
# We can cache existing performances for speed, but SQL check is safer for now
existing = session.exec(
select(Performance).where(
Performance.show_id == db_show_id,
Performance.song_id == song_id,
Performance.position == position
)
).first()
if existing:
# Check uniqueness
if (db_show_id, song_id, position) in existing_keys:
continue
# Map setnumber to a set name, then create the Performance
set_val = str(perf.get('setnumber', '1'))
if set_val.isdigit():
set_name = f"Set {set_val}"
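The duplicate-check cache above is sized by eye (a few MB for roughly 40k keys); the figure is easy to check empirically. A rough, self-contained sketch with synthetic keys; it counts the set's hash table and the key tuples themselves, not the int objects inside them:

import sys

def approx_set_size(keys):
    # Hash table of the set itself plus each key tuple it holds.
    return sys.getsizeof(keys) + sum(sys.getsizeof(t) for t in keys)

keys = {(show_id, song_id, pos)
        for show_id in range(200)
        for song_id in range(20)
        for pos in range(10)}  # 40,000 synthetic (show_id, song_id, position) keys
print(f"{approx_set_size(keys) / 1e6:.1f} MB for {len(keys):,} keys")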
@@ -173,35 +157,24 @@ def main():
set_name=set_name,
segue=bool(perf.get('segue', 0)),
notes=perf.get('footnote'),
slug=f"{generate_slug(song_name)}-{db_show_id}-{position}" # temp slug strategy
slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
)
session.add(new_perf)
new_objects.append(new_perf)
existing_keys.add((db_show_id, song_id, position)) # Add to cache
batch_added += 1
total_added += 1
session.commit()
elapsed = time.time() - start_time
print(f" Page {page}: Processed {len(data)}, Added {batch_added} ({elapsed:.2f}s)")
# Note: pages with no new adds could in principle be skipped, but the setlists
# endpoint appears to be ordered by date, so we still walk the full history.
if new_objects:
session.add_all(new_objects)
session.commit()
print(f" Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
page += 1
if page > 2000: # Safety break
break
# Fix slugs properly
print("\n5. Fixing Slugs...")
# The slug generated above uses the DB show ID, which is unique but not pretty;
# update it to a date-based slug here, or rely on the separate slug fixer.
print("\n" + "=" * 60)
print("IMPORT COMPLETE")
print(f"Total Added: {total_added}")
print("=" * 60)
if page % 20 == 0:
gc.collect()
print(f"\nImport Complete! Total Added: {total_added}")
if __name__ == "__main__":
main()
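The slug written during import ({song-slug}-{db_show_id}-{position}) is unique but, as the step-5 comments above note, not pretty. One possible follow-up pass, not part of this commit, assuming the Show and Performance models used above and one show per date (the same assumption the date map makes); prettify_slugs is a hypothetical name:

def prettify_slugs():
    # One-off pass: rewrite slugs as <song-slug>-<YYYY-MM-DD>-<position>.
    # Loads every performance at once, which is acceptable for a single cleanup run.
    with Session(engine) as session:
        rows = session.exec(
            select(Performance, Show.date).join(Show, Show.id == Performance.show_id)
        ).all()
        for perf, show_date in rows:
            base = perf.slug.rsplit('-', 2)[0]  # drop the "-<show_id>-<position>" suffix
            perf.slug = f"{base}-{show_date.strftime('%Y-%m-%d')}-{perf.position}"
            session.add(perf)
        session.commit()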