Optimize setlist import for memory
This commit is contained in:
parent e2c77d7593
commit 14f016977a

1 changed file with 86 additions and 113 deletions
@@ -1,16 +1,13 @@
 """
-Smart Setlist Importer
+Smart Setlist Importer (Streaming Version)
 
-Uses a 2-step mapping strategy to bypass missing dates in setlist endpoint:
-1. Fetch ALL shows from API -> Map ElGoose_ID to Date.
-2. Fetch ALL DB shows -> Map Date to DB_ID.
-3. Combine: ElGoose_ID -> DB_ID.
-4. Import setlists using ElGoose_ID from setlist entries.
+Reducing memory usage by processing data in streams instead of bulk loading.
 """
 import requests
 import time
+import gc
 from datetime import datetime
-from sqlmodel import Session, select, func
+from sqlmodel import Session, select
 from database import engine
 from models import Show, Song, Performance
 from slugify import generate_slug
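The hunks below swap full-object queries such as select(Show) for column-only queries like select(Show.id, Show.date), which return lightweight Row tuples instead of fully tracked ORM instances. A minimal, self-contained sketch of that pattern, using a stand-in Show model and an in-memory SQLite database (the real models live in models.py and carry more fields):

import datetime
from typing import Optional

from sqlmodel import Field, Session, SQLModel, create_engine, select


class Show(SQLModel, table=True):
    # Stand-in model with only the fields the importer reads.
    id: Optional[int] = Field(default=None, primary_key=True)
    date: datetime.date


engine = create_engine("sqlite://")  # in-memory database, for the sketch only
SQLModel.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Show(date=datetime.date(2024, 6, 21)))
    session.commit()

    # Full objects: every column is loaded and tracked by the session.
    full_shows = session.exec(select(Show)).all()

    # Column-only query: plain Row tuples carrying just id and date.
    rows = session.exec(select(Show.id, Show.date)).all()
    date_to_db_id = {r.date.strftime("%Y-%m-%d"): r.id for r in rows}
    print(date_to_db_id)  # {'2024-06-21': 1}

The narrower result set is what lets the commit build its date -> id map and then immediately del the source list.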
@@ -33,126 +30,113 @@ def main():
         time.sleep(2)
     return None
 
-def fetch_all_pages(endpoint, params=None):
-    """Fetch all pages from an endpoint"""
-    if params is None:
-        params = {}
-
-    results = []
-    page = 1
-    while True:
-        print(f" Fetching {endpoint} page {page}...", end="\r", flush=True)
-        p = params.copy()
-        p['page'] = page
-        data = fetch_json(endpoint, p)
-        if not data:
-            break
-        results.extend(data)
-        page += 1
-        time.sleep(0.1)  # Be nice
-    print(f"\n Fetched {len(results)} items from {endpoint}")
-    return results
-
 def main():
     print("=" * 60)
-    print("SMART SETLIST IMPORTER")
+    print("SMART SETLIST IMPORTER (STREAMING)")
     print("=" * 60)
 
     with Session(engine) as session:
         # 1. Build DB Map: Date string -> DB Show ID
         print("\n1. Building DB Map (Date -> Show ID)...")
-        shows = session.exec(select(Show)).all()
+        shows = session.exec(select(Show.id, Show.date)).all()  # Only fetch needed fields
         date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
         print(f" Mapped {len(date_to_db_id)} existing shows in DB")
 
         if not date_to_db_id:
-            print(" CRITICAL: No shows in database! Run import_shows first.")
+            print(" CRITICAL: No shows in database!")
             return
 
-        # 2. Build API Map: ElGoose ID -> Date
-        print("\n2. Fetching API Shows to build ElGoose ID -> Date map...")
-        # Only fetch shows for our artist (Goose = 3)
-        api_shows = fetch_all_pages("shows", {"artist": 3})
-        if not api_shows:
-            # Fallback if artist filter fails or returns empty
-            print(" Artist filter returned empty, fetching all shows...")
-            api_shows = fetch_all_pages("shows")
+        del shows
+        gc.collect()
+
+        # 2. Build API Map: ElGoose ID -> DB ID
+        # Process iteratively to save memory
+        print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
 
         elgoose_id_to_db_id = {}
         matched_count = 0
 
-        for s in api_shows:
-            s_date = s.get('showdate')
-            s_id = s.get('show_id')
-            if s_date and s_id:
-                # Lookup in DB map
-                db_id = date_to_db_id.get(s_date)
-                if db_id:
-                    elgoose_id_to_db_id[s_id] = db_id
-                    matched_count += 1
-
-        print(f" Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
-
-        # 3. Cache Songs for Lookup
-        print("\n3. Caching Songs...")
-        songs = session.exec(select(Song)).all()
-        song_map = {s.title.lower().strip(): s.id for s in songs}  # title -> id
-        print(f" Cached {len(song_map)} songs")
-
-        # 4. Fetch and Import Setlists
-        print("\n4. Fetching Setlists and Importing...")
-        # Since we can't filter setlists by artist easily without checking every item,
-        # we'll fetch all and filter by our known show IDs.
-
         page = 1
-        total_added = 0
-        total_processed = 0
-
         while True:
-            start_time = time.time()
-            data = fetch_json("setlists", {"page": page})
-
+            # Fetch batch of shows
+            print(f" Fetching shows page {page}...", end="\r", flush=True)
+            data = fetch_json("shows", {"page": page})  # Fetch all shows (artist filter can be flaky)
             if not data:
-                print(" No more data.")
                 break
 
+            for s in data:
+                # We only need Goose shows (artist_id=3 usually, but we check date match)
+                s_date = s.get('showdate')
+                s_id = s.get('show_id')
+
+                if s_date and s_id:
+                    db_id = date_to_db_id.get(s_date)
+                    if db_id:
+                        elgoose_id_to_db_id[s_id] = db_id
+                        matched_count += 1
+
+            page += 1
+            if page % 10 == 0:
+                gc.collect()
+
+        print(f"\n Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
+        del date_to_db_id
+        gc.collect()
+
+        # 3. Cache Songs
+        print("\n3. Caching Songs...")
+        songs = session.exec(select(Song.id, Song.title)).all()
+        song_map = {s.title.lower().strip(): s.id for s in songs}
+        del songs
+        gc.collect()
+        print(f" Cached {len(song_map)} songs")
+
+        # 4. Process Setlists
+        print("\n4. Importing Setlists...")
+        page = 1
+        total_added = 0
+
+        while True:
+            data = fetch_json("setlists", {"page": page})
+            if not data:
+                break
+
+            # Prefetch checks for this batch to avoid N+1 SELECTs?
+            # Actually with 3600 perfs, one-by-one check is slow.
+            # But "existing check" is needed.
+            # We can cache *existing performances* for the CURRENT batch's shows?
+            # Or just cache ALL existing performance keys (show_id, song_id, position)?
+            # Performance table might be large (40k rows?).
+            # (show_id, song_id, position) tuples set is ~2MB RAM. Safe.
+
+            if page == 1:
+                print(" Caching existing performance keys...")
+                perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
+                existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
+                print(f" Cached {len(existing_keys)} performance keys")
+                del perfs
+                gc.collect()
+
             batch_added = 0
+            new_objects = []
+
             for perf in data:
-                total_processed += 1
                 elgoose_show_id = perf.get('show_id')
 
-                # Check if this performance belongs to a show we care about
                 db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
                 if not db_show_id:
-                    continue  # Not a Goose show or show not in our DB
+                    continue
 
-                # Resolve Song
                 song_name = perf.get('songname', '').strip()
                 song_id = song_map.get(song_name.lower())
 
                 if not song_id:
-                    # Try creating song if missing?
-                    # Ideally we should have imported all songs, but let's be safe
-                    # For now skip or log
                     continue
 
                 position = perf.get('position', 0)
 
-                # Check duplication
-                # We can cache existing performances for speed, but SQL check is safer for now
-                existing = session.exec(
-                    select(Performance).where(
-                        Performance.show_id == db_show_id,
-                        Performance.song_id == song_id,
-                        Performance.position == position
-                    )
-                ).first()
-
-                if existing:
+                # Check uniqueness
+                if (db_show_id, song_id, position) in existing_keys:
                     continue
 
-                # Create Performance
-                # Map setnumber
+                # Create
                 set_val = str(perf.get('setnumber', '1'))
                 if set_val.isdigit():
                     set_name = f"Set {set_val}"
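The duplicate check in the hunk above is the main de-N+1 change: instead of issuing one SELECT per setlist entry, the importer loads every existing (show_id, song_id, position) key once and tests membership in a Python set. A tiny sketch of that pattern with invented keys:

# Keys loaded once from the Performance table (values here are made up).
existing_keys = {(12, 7, 1), (12, 9, 2)}

# Keys derived from one page of API setlist entries.
incoming = [(12, 7, 1), (12, 11, 3), (40, 2, 1)]

new_keys = [k for k in incoming if k not in existing_keys]
existing_keys.update(new_keys)  # keep the cache current as rows are queued
print(new_keys)  # -> [(12, 11, 3), (40, 2, 1)]

As the in-diff comments reason, a set of a few tens of thousands of small integer tuples stays in the low megabytes, which is a cheap trade for dropping per-row queries.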
@@ -173,35 +157,24 @@ def main():
                     set_name=set_name,
                     segue=bool(perf.get('segue', 0)),
                     notes=perf.get('footnote'),
-                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"  # temp slug strategy
+                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
                 )
-                session.add(new_perf)
+                new_objects.append(new_perf)
+                existing_keys.add((db_show_id, song_id, position))  # Add to cache
                 batch_added += 1
                 total_added += 1
 
-            session.commit()
-            elapsed = time.time() - start_time
-            print(f" Page {page}: Processed {len(data)}, Added {batch_added} ({elapsed:.2f}s)")
-
-            # Optimization: If we see mostly empty adds for many pages,
-            # we might want to skip, BUT setlists endpoint is usually ordered by date desc?
-            # We must go through all history.
-
+            if new_objects:
+                session.add_all(new_objects)
+                session.commit()
+
+            print(f" Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
             page += 1
-            if page > 2000:  # Safety break
-                break
+            if page % 20 == 0:
+                gc.collect()
 
-        # Fix slugs properly
-        print("\n5. Fixing Slugs...")
-        # (Slugs generated above might be generic, ideally update based on show date)
-        # But for speed let's rely on the previous fixer or just update here if needed.
-        # The above slug uses ID which is unique but not pretty.
-        # Let's run a quick update for pretty slugs
-
-        print("\n" + "=" * 60)
-        print("IMPORT COMPLETE")
-        print(f"Total Added: {total_added}")
-        print("=" * 60)
+        print(f"\nImport Complete! Total Added: {total_added}")
 
 if __name__ == "__main__":
     main()
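The final hunk also batches writes: new Performance rows are collected per page and flushed with a single add_all()/commit(), and the commit is skipped when a page contributes nothing. A self-contained sketch of that pattern, again with a stand-in model, an in-memory database, and hypothetical page data:

from typing import Optional

from sqlmodel import Field, Session, SQLModel, create_engine, select


class Performance(SQLModel, table=True):
    # Stand-in model with just the uniqueness-relevant columns.
    id: Optional[int] = Field(default=None, primary_key=True)
    show_id: int
    song_id: int
    position: int


engine = create_engine("sqlite://")  # in-memory database, for the sketch only
SQLModel.metadata.create_all(engine)

# Two hypothetical "pages" of (show_id, song_id, position) tuples.
pages = [
    [(1, 10, 1), (1, 11, 2)],
    [],  # an empty page: no commit is issued for it
]

with Session(engine) as session:
    for page in pages:
        new_objects = [
            Performance(show_id=s, song_id=g, position=p) for s, g, p in page
        ]
        if new_objects:  # avoid empty commits
            session.add_all(new_objects)
            session.commit()

    print(len(session.exec(select(Performance)).all()))  # -> 2

One commit per page keeps transactions small without paying the per-row flush overhead the old session.add() loop incurred.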