feat: Add YouTube video import script for performances and shows

This commit is contained in:
fullsizemalt 2025-12-21 22:20:10 -08:00
parent 060797a233
commit 8620841932

185
backend/import_youtube.py Normal file
View file

@@ -0,0 +1,185 @@
"""
YouTube Video Import Script
Parses youtube.md and links videos to Performance and Show entities.
"""
import json
import re
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Performance, Show, Song
def make_youtube_url(video_id: str) -> str:
    """Return the YouTube watch-page URL for ``video_id``.

    The ID is interpolated as-is into the ``v`` query parameter; no
    validation or escaping is performed.
    """
    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    return watch_url
def parse_youtube_md(filepath: str) -> list:
    """Extract the JSON array embedded in a markdown file's ```json fence.

    The file is read as UTF-8. The first ```json fence whose content starts
    with ``[`` is located, markdown backslash escapes in front of hyphens
    and underscores are removed, and exactly one JSON value is decoded
    starting at that bracket. Decoding a complete value (instead of cutting
    the text at the first ``]``) keeps nested arrays and brackets inside
    strings intact.

    Args:
        filepath: Path of the markdown file to read.

    Returns:
        The decoded list, or an empty list when no fenced array is found or
        the JSON is invalid (a message is printed in both cases).

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Locate the opening fence; the lookahead requires the payload to be an
    # array without consuming the bracket itself.
    match = re.search(r'```json\s*(?=\[)', content)
    if not match:
        print("No JSON block found in file.")
        return []

    json_str = content[match.end():]
    # Clean up escaped characters from markdown
    json_str = json_str.replace('\\-', '-').replace('\\_', '_')

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # the closing fence and any trailing markdown are ignored.
        videos, _ = json.JSONDecoder().raw_decode(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON parse error: {e}")
        return []
    return videos
def normalize_title(title: str) -> str:
    """Return ``title`` trimmed of surrounding whitespace and lowercased."""
    trimmed = title.strip()
    return trimmed.lower()
# Video types that are never linked, mapped to the label used in the log line.
_SKIPPED_VIDEO_TYPES = {
    'documentary': 'Documentary',
    'visualizer': 'Visualizer',
    'session': 'Session',
}

# Segue markers separating song titles inside a sequence video title.
# NOTE(review): the separator literal in the previous revision was an empty
# string (which makes str.split raise ValueError); "→" (U+2192), "->" and ">"
# are assumed here -- confirm against the titles used in youtube.md.
_SEQUENCE_SEPARATOR = re.compile(r'\s*(?:\u2192|->|>)\s*')


def _parse_show_date(date_str):
    """Return ``date_str`` parsed as ``%Y-%m-%d``, or None if it is invalid."""
    try:
        return datetime.strptime(date_str, '%Y-%m-%d')
    except (TypeError, ValueError):
        return None


def _find_show(session, show_date):
    """Return the first Show whose ``date`` equals ``show_date``, or None."""
    return session.exec(select(Show).where(Show.date == show_date)).first()


def _find_performances(session, show, song_title):
    """Yield performances on ``show`` for songs whose title contains ``song_title``.

    Song titles are matched case-insensitively as a substring (``ilike``).
    Queries run lazily, so a caller that only needs the first match stops
    after the first hit.
    """
    songs = session.exec(
        select(Song).where(Song.title.ilike(f"%{song_title}%"))
    ).all()
    for song in songs:
        perf = session.exec(
            select(Performance).where(
                Performance.show_id == show.id,
                Performance.song_id == song.id,
            )
        ).first()
        if perf:
            yield perf


def import_videos(videos: list):
    """Import video links into the database.

    Each entry of ``videos`` is a dict read through the keys ``videoId``,
    ``title``, ``type`` (defaults to ``'song'``) and ``date`` (``%Y-%m-%d``).

    * ``documentary``, ``visualizer`` and ``session`` entries are skipped.
    * ``full_show`` entries set ``youtube_link`` on the Show found by date.
    * Sequence entries (type ``sequence`` or a title containing a segue
      marker) set ``youtube_link`` on every matching Performance of every
      song named in the title.
    * All other entries set ``youtube_link`` on the first matching
      Performance for the title.

    Entries without a video ID, without a usable date, or without a title
    are counted as skipped instead of aborting the run. All changes are
    committed once after the whole list is processed, then a summary of the
    counters is printed.

    Args:
        videos: List of video dicts as produced by ``parse_youtube_md``.
    """
    stats = {
        'songs_matched': 0,
        'songs_not_found': 0,
        'sequences_processed': 0,
        'full_shows_matched': 0,
        'full_shows_not_found': 0,
        'skipped': 0
    }

    with Session(engine) as session:
        for video in videos:
            video_id = video.get('videoId')
            # A JSON null title would otherwise break the string handling below.
            title = video.get('title') or ''
            video_type = video.get('type', 'song')
            date_str = video.get('date')

            if video_type in _SKIPPED_VIDEO_TYPES:
                print(f"[SKIP] {_SKIPPED_VIDEO_TYPES[video_type]}: {title}")
                stats['skipped'] += 1
                continue

            # Without an ID the stored link would end in "v=None".
            if not video_id:
                print(f"[SKIP] No videoId: {title}")
                stats['skipped'] += 1
                continue
            youtube_url = make_youtube_url(video_id)

            if video_type == 'full_show':
                if not date_str:
                    print(f"[FULL SHOW SKIP] No date for: {title}")
                    stats['skipped'] += 1
                    continue
                show_date = _parse_show_date(date_str)
                if show_date is None:
                    print(f"[SKIP] Invalid date format: {date_str}")
                    stats['skipped'] += 1
                    continue
                show = _find_show(session, show_date)
                if show:
                    show.youtube_link = youtube_url
                    session.add(show)
                    print(f"[FULL SHOW] Linked: {title} -> Show ID {show.id}")
                    stats['full_shows_matched'] += 1
                else:
                    print(f"[FULL SHOW NOT FOUND] {title} (date: {date_str})")
                    stats['full_shows_not_found'] += 1
                continue

            # An empty title would turn the ilike pattern into "%%", which
            # matches every song.
            if not title.strip():
                print(f"[SKIP] No title for video: {video_id}")
                stats['skipped'] += 1
                continue

            # Parse date
            if not date_str:
                print(f"[SKIP] No date: {title}")
                stats['skipped'] += 1
                continue
            show_date = _parse_show_date(date_str)
            if show_date is None:
                print(f"[SKIP] Invalid date format: {date_str}")
                stats['skipped'] += 1
                continue

            # Find show by date
            show = _find_show(session, show_date)
            if not show:
                print(f"[SHOW NOT FOUND] Date: {date_str} for video: {title}")
                stats['songs_not_found'] += 1
                continue

            # Handle sequences (multiple songs)
            is_sequence = (
                video_type == 'sequence'
                or _SEQUENCE_SEPARATOR.search(title) is not None
            )
            if is_sequence:
                # Drop empty segments (e.g. a trailing marker) so they do
                # not match every song.
                song_titles = [
                    part.strip()
                    for part in _SEQUENCE_SEPARATOR.split(title)
                    if part.strip()
                ]
                matched_any = False
                for song_title in song_titles:
                    for perf in _find_performances(session, show, song_title):
                        perf.youtube_link = youtube_url
                        session.add(perf)
                        print(f"[SEQUENCE] Linked: {song_title} -> Performance ID {perf.id}")
                        matched_any = True
                if matched_any:
                    stats['sequences_processed'] += 1
                else:
                    print(f"[SEQUENCE NOT FOUND] {title} on {date_str}")
                    stats['songs_not_found'] += 1
                continue

            # Single song: link only the first matching performance.
            perf = next(_find_performances(session, show, title.strip()), None)
            if perf:
                perf.youtube_link = youtube_url
                session.add(perf)
                print(f"[SONG] Linked: {title} -> Performance ID {perf.id}")
                stats['songs_matched'] += 1
            else:
                print(f"[SONG NOT FOUND] {title} on {date_str}")
                stats['songs_not_found'] += 1

        session.commit()

    print("\n=== Import Summary ===")
    for key, value in stats.items():
        print(f" {key}: {value}")
if __name__ == "__main__":
    import sys

    # The first command-line argument, when given, is the markdown file to
    # parse; otherwise fall back to youtube.md one directory above the
    # current working directory.
    cli_args = sys.argv[1:]
    filepath = cli_args[0] if cli_args else "../youtube.md"

    print(f"Parsing YouTube data from: {filepath}")
    videos = parse_youtube_md(filepath)
    print(f"Found {len(videos)} videos")

    # Only touch the database when parsing produced at least one entry.
    if len(videos) != 0:
        import_videos(videos)