Compare commits


No commits in common. "29e3e07141603ecfa9da385e9e391e87b5b477a1" and "e2c77d759335350511755bf671ea56623fe0a44e" have entirely different histories.


@@ -1,13 +1,16 @@
 """
-Smart Setlist Importer (Streaming Version)
-Reducing memory usage by processing data in streams instead of bulk loading.
+Smart Setlist Importer
+Uses a 2-step mapping strategy to bypass missing dates in setlist endpoint:
+1. Fetch ALL shows from API -> Map ElGoose_ID to Date.
+2. Fetch ALL DB shows -> Map Date to DB_ID.
+3. Combine: ElGoose_ID -> DB_ID.
+4. Import setlists using ElGoose_ID from setlist entries.
 """
 import requests
 import time
-import gc
 from datetime import datetime
-from sqlmodel import Session, select
+from sqlmodel import Session, select, func
 from database import engine
 from models import Show, Song, Performance
 from slugify import generate_slug
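The strategy described in the new docstring is essentially a join of two dictionaries on the date string: ElGoose_ID -> Date from the API, Date -> DB_ID from the database, combined into ElGoose_ID -> DB_ID. A minimal standalone sketch of that combine step, using made-up show records in place of the real API payloads and DB rows:

```python
# Hypothetical data for illustration only; field names mirror the script's use of
# 'show_id' / 'showdate' from the API and (id, date) rows from the database.
api_shows = [
    {"show_id": 101, "showdate": "2023-06-10"},
    {"show_id": 102, "showdate": "2023-06-11"},
    {"show_id": 103, "showdate": "1999-01-01"},  # no matching show in the DB
]
db_shows = [(1, "2023-06-10"), (2, "2023-06-11")]  # (db_id, date string)

# Step 1: ElGoose ID -> date.  Step 2: date -> DB ID.  Step 3: combine on the date.
elgoose_to_date = {s["show_id"]: s["showdate"] for s in api_shows}
date_to_db_id = {d: i for i, d in db_shows}
elgoose_id_to_db_id = {
    gid: date_to_db_id[date]
    for gid, date in elgoose_to_date.items()
    if date in date_to_db_id
}
print(elgoose_id_to_db_id)  # {101: 1, 102: 2}
```

Any ElGoose ID whose date has no counterpart in the database simply drops out of the combined map, which is what lets the importer skip non-matching shows later on.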
@@ -30,124 +33,126 @@ def fetch_json(endpoint, params=None):
         time.sleep(2)
     return None
 
+def fetch_all_pages(endpoint, params=None):
+    """Fetch all pages from an endpoint"""
+    if params is None:
+        params = {}
+    results = []
+    page = 1
+    while True:
+        print(f" Fetching {endpoint} page {page}...", end="\r", flush=True)
+        p = params.copy()
+        p['page'] = page
+        data = fetch_json(endpoint, p)
+        if not data:
+            break
+        results.extend(data)
+        page += 1
+        time.sleep(0.1) # Be nice
+    print(f"\n Fetched {len(results)} items from {endpoint}")
+    return results
+
 def main():
     print("=" * 60)
-    print("SMART SETLIST IMPORTER (STREAMING)")
+    print("SMART SETLIST IMPORTER")
     print("=" * 60)
 
     with Session(engine) as session:
         # 1. Build DB Map: Date string -> DB Show ID
         print("\n1. Building DB Map (Date -> Show ID)...")
-        shows = session.exec(select(Show.id, Show.date)).all() # Only fetch needed fields
+        shows = session.exec(select(Show)).all()
         date_to_db_id = {s.date.strftime('%Y-%m-%d'): s.id for s in shows}
         print(f" Mapped {len(date_to_db_id)} existing shows in DB")
 
         if not date_to_db_id:
-            print(" CRITICAL: No shows in database!")
+            print(" CRITICAL: No shows in database! Run import_shows first.")
             return
 
-        del shows
-        gc.collect()
-
-        # 2. Build API Map: ElGoose ID -> DB ID
-        # Process iteratively to save memory
-        print("\n2. Building ElGoose ID -> DB ID map (Streaming)...")
+        # 2. Build API Map: ElGoose ID -> Date
+        print("\n2. Fetching API Shows to build ElGoose ID -> Date map...")
+        # Only fetch shows for our artist (Goose = 3)
+        api_shows = fetch_all_pages("shows", {"artist": 3})
+        if not api_shows:
+            # Fallback if artist filter fails or returns empty
+            print(" Artist filter returned empty, fetching all shows...")
+            # Fetch all shows (artist filter can be flaky)
+            api_shows = fetch_all_pages("shows")
 
         elgoose_id_to_db_id = {}
         matched_count = 0
-        page = 1
-        seen_ids_in_run = set()
-
-        while True:
-            # Fetch batch of shows
-            print(f" Fetching shows page {page}...", end="\r", flush=True)
-            data = fetch_json("shows", {"page": page})
-            if not data:
-                break
-
-            # Check for API loop (if Page X returns same content as Page 1)
-            first_id_in_batch = data[0].get('show_id') if data else None
-            if first_id_in_batch and first_id_in_batch in seen_ids_in_run:
-                print(f"\n Loop detected at page {page} (ID {first_id_in_batch} seen before). Breaking.")
-                break
-
-            for s in data:
-                # We only need Goose shows (artist_id=3 usually, but we check date match)
-                s_date = s.get('showdate')
-                s_id = s.get('show_id')
-                if s_id:
-                    seen_ids_in_run.add(s_id)
-                if s_date and s_id:
-                    db_id = date_to_db_id.get(s_date)
-                    if db_id:
-                        elgoose_id_to_db_id[s_id] = db_id
-                        matched_count += 1
-            page += 1
-            if page % 10 == 0:
-                gc.collect()
-
-        print(f"\n Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
-        del date_to_db_id
-        gc.collect()
-
-        # 3. Cache Songs
+        for s in api_shows:
+            s_date = s.get('showdate')
+            s_id = s.get('show_id')
+            if s_date and s_id:
+                # Lookup in DB map
+                db_id = date_to_db_id.get(s_date)
+                if db_id:
+                    elgoose_id_to_db_id[s_id] = db_id
+                    matched_count += 1
+
+        print(f" Mapped {len(elgoose_id_to_db_id)} ElGoose IDs to DB IDs")
+
+        # 3. Cache Songs for Lookup
         print("\n3. Caching Songs...")
-        songs = session.exec(select(Song.id, Song.title)).all()
-        song_map = {s.title.lower().strip(): s.id for s in songs}
-        del songs
-        gc.collect()
+        songs = session.exec(select(Song)).all()
+        song_map = {s.title.lower().strip(): s.id for s in songs} # title -> id
         print(f" Cached {len(song_map)} songs")
 
-        # 4. Process Setlists
-        print("\n4. Importing Setlists...")
+        # 4. Fetch and Import Setlists
+        print("\n4. Fetching Setlists and Importing...")
+        # Since we can't filter setlists by artist easily without checking every item,
+        # we'll fetch all and filter by our known show IDs.
         page = 1
         total_added = 0
-        total_processed = 0
 
         while True:
+            start_time = time.time()
            data = fetch_json("setlists", {"page": page})
            if not data:
+                print(" No more data.")
                break
 
-            # Prefetch checks for this batch to avoid N+1 SELECTs?
-            # Actually with 3600 perfs, one-by-one check is slow.
-            # But "existing check" is needed.
-            # We can cache *existing performances* for the CURRENT batch's shows?
-            # Or just cache ALL existing performance keys (show_id, song_id, position)?
-            # Performance table might be large (40k rows?).
-            # (show_id, song_id, position) tuples set is ~2MB RAM. Safe.
-            if page == 1:
-                print(" Caching existing performance keys...")
-                perfs = session.exec(select(Performance.show_id, Performance.song_id, Performance.position)).all()
-                existing_keys = set((p.show_id, p.song_id, p.position) for p in perfs)
-                print(f" Cached {len(existing_keys)} performance keys")
-                del perfs
-                gc.collect()
-
            batch_added = 0
-            new_objects = []
            for perf in data:
-                total_processed += 1
                elgoose_show_id = perf.get('show_id')
+                # Check if this performance belongs to a show we care about
                db_show_id = elgoose_id_to_db_id.get(elgoose_show_id)
                if not db_show_id:
-                    continue
+                    continue # Not a Goose show or show not in our DB
 
+                # Resolve Song
                song_name = perf.get('songname', '').strip()
                song_id = song_map.get(song_name.lower())
                if not song_id:
+                    # Try creating song if missing?
+                    # Ideally we should have imported all songs, but let's be safe
+                    # For now skip or log
                    continue
 
                position = perf.get('position', 0)
 
-                # Check uniqueness
-                if (db_show_id, song_id, position) in existing_keys:
+                # Check duplication
+                # We can cache existing performances for speed, but SQL check is safer for now
+                existing = session.exec(
+                    select(Performance).where(
+                        Performance.show_id == db_show_id,
+                        Performance.song_id == song_id,
+                        Performance.position == position
+                    )
+                ).first()
+                if existing:
                    continue
 
-                # Create
+                # Create Performance
+                # Map setnumber
                set_val = str(perf.get('setnumber', '1'))
                if set_val.isdigit():
                    set_name = f"Set {set_val}"
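The two sides of this hunk resolve duplicates differently: the streaming version preloads every existing (show_id, song_id, position) key into a set, while the newer version runs a SELECT per candidate performance. A rough sketch of the set-based check the removed comments describe, with a hypothetical fetch_existing_keys() standing in for the select(Performance.show_id, Performance.song_id, Performance.position) query:

```python
from typing import Iterable, Tuple

Key = Tuple[int, int, int]  # (show_id, song_id, position)

def fetch_existing_keys() -> Iterable[Key]:
    """Hypothetical stand-in for the single SQLModel query over Performance."""
    return [(1, 10, 1), (1, 11, 2)]

# One up-front query, then O(1) in-memory membership checks for every row.
existing_keys = set(fetch_existing_keys())

def is_duplicate(show_id: int, song_id: int, position: int) -> bool:
    return (show_id, song_id, position) in existing_keys

def remember(show_id: int, song_id: int, position: int) -> None:
    # Keep the cache current so the same batch cannot insert the row twice.
    existing_keys.add((show_id, song_id, position))

print(is_duplicate(1, 10, 1))  # True  -> skip
print(is_duplicate(1, 12, 3))  # False -> insert, then remember(1, 12, 3)
```

The trade-off the removed comments weigh is one up-front query plus a few megabytes of RAM against tens of thousands of per-row round trips; the per-row SQL check is simpler and always consistent with the database, which is why the newer version calls it "safer for now".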
@@ -168,24 +173,35 @@ def main():
                    set_name=set_name,
                    segue=bool(perf.get('segue', 0)),
                    notes=perf.get('footnote'),
-                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}"
+                    slug=f"{generate_slug(song_name)}-{db_show_id}-{position}" # temp slug strategy
                )
-                new_objects.append(new_perf)
-                existing_keys.add((db_show_id, song_id, position)) # Add to cache
+                session.add(new_perf)
                batch_added += 1
                total_added += 1
 
-            if new_objects:
-                session.add_all(new_objects)
-                session.commit()
-
-            # Optimization: If we see mostly empty adds for many pages,
-            # we might want to skip, BUT setlists endpoint is usually ordered by date desc?
-            # We must go through all history.
-            print(f" Page {page}: Added {batch_added} (Total {total_added})", end="\r", flush=True)
+            session.commit()
+            elapsed = time.time() - start_time
+            print(f" Page {page}: Processed {len(data)}, Added {batch_added} ({elapsed:.2f}s)")
            page += 1
-            if page % 20 == 0:
-                gc.collect()
+            if page > 2000: # Safety break
+                break
 
-        print(f"\nImport Complete! Total Added: {total_added}")
+        # Fix slugs properly
+        print("\n5. Fixing Slugs...")
+        # (Slugs generated above might be generic, ideally update based on show date)
+        # But for speed let's rely on the previous fixer or just update here if needed.
+        # The above slug uses ID which is unique but not pretty.
+        # Let's run a quick update for pretty slugs
+
+        print("\n" + "=" * 60)
+        print("IMPORT COMPLETE")
+        print(f"Total Added: {total_added}")
+        print("=" * 60)
 
 if __name__ == "__main__":
     main()
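Both versions stamp performances with a temporary "{song}-{db_show_id}-{position}" slug, and the closing comments suggest rebuilding prettier slugs from the show date in a later pass. A hedged sketch of what such a date-based slug could look like; the generate_slug stand-in below only approximates whatever slugify.generate_slug actually does in this project, and pretty_slug is a hypothetical helper, not code from either commit:

```python
from datetime import datetime

def generate_slug(text: str) -> str:
    """Stand-in for the project's slugify.generate_slug; assumed behaviour only."""
    return "-".join("".join(c if c.isalnum() else " " for c in text.lower()).split())

def pretty_slug(song_name: str, show_date: datetime, position: int) -> str:
    # e.g. "arcadia-2023-06-10-5" instead of the id-based "arcadia-42-5"
    return f"{generate_slug(song_name)}-{show_date.strftime('%Y-%m-%d')}-{position}"

print(pretty_slug("Arcadia", datetime(2023, 6, 10), 5))  # arcadia-2023-06-10-5
```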