""" YouTube Video Import Script Parses youtube.md and links videos to Performance and Show entities. """ import json import re from datetime import datetime from sqlmodel import Session, select from database import engine from models import Performance, Show, Song # Construct YouTube embed URL from videoId def make_youtube_url(video_id: str) -> str: return f"https://www.youtube.com/watch?v={video_id}" def parse_youtube_md(filepath: str) -> list: """Extract JSON array from youtube.md markdown file.""" with open(filepath, 'r') as f: content = f.read() # Find JSON block (between ```json and ```) match = re.search(r'```json\s*\n?\s*(\[.*?\])', content, re.DOTALL) if not match: print("No JSON block found in file.") return [] json_str = match.group(1) # Clean up escaped characters from markdown json_str = json_str.replace('\\-', '-').replace('\\_', '_') try: return json.loads(json_str) except json.JSONDecodeError as e: print(f"JSON parse error: {e}") return [] def normalize_title(title: str) -> str: """Normalize song title for matching.""" return title.strip().lower() def import_videos(videos: list): """Import video links into the database.""" stats = { 'songs_matched': 0, 'songs_not_found': 0, 'sequences_processed': 0, 'full_shows_matched': 0, 'full_shows_not_found': 0, 'skipped': 0 } with Session(engine) as session: for video in videos: video_id = video.get('videoId') title = video.get('title', '') video_type = video.get('type', 'song') date_str = video.get('date') youtube_url = make_youtube_url(video_id) if video_type == 'documentary': print(f"[SKIP] Documentary: {title}") stats['skipped'] += 1 continue if video_type == 'visualizer': print(f"[SKIP] Visualizer: {title}") stats['skipped'] += 1 continue if video_type == 'session': print(f"[SKIP] Session: {title}") stats['skipped'] += 1 continue if video_type == 'full_show': # Match by date or event name event_name = video.get('event') if date_str: show_date = datetime.strptime(date_str, '%Y-%m-%d') statement = select(Show).where(Show.date == show_date) show = session.exec(statement).first() if show: show.youtube_link = youtube_url session.add(show) print(f"[FULL SHOW] Linked: {title} -> Show ID {show.id}") stats['full_shows_matched'] += 1 else: print(f"[FULL SHOW NOT FOUND] {title} (date: {date_str})") stats['full_shows_not_found'] += 1 else: print(f"[FULL SHOW SKIP] No date for: {title}") stats['skipped'] += 1 continue # Parse date if not date_str: print(f"[SKIP] No date: {title}") stats['skipped'] += 1 continue try: show_date = datetime.strptime(date_str, '%Y-%m-%d') except ValueError: print(f"[SKIP] Invalid date format: {date_str}") stats['skipped'] += 1 continue # Find show by date show_statement = select(Show).where(Show.date == show_date) show = session.exec(show_statement).first() if not show: print(f"[SHOW NOT FOUND] Date: {date_str} for video: {title}") stats['songs_not_found'] += 1 continue # Handle sequences (multiple songs) if video_type == 'sequence' or '→' in title: song_titles = [s.strip() for s in title.split('→')] matched_any = False for song_title in song_titles: # Find song by title song_statement = select(Song).where(Song.title.ilike(f"%{song_title}%")) songs = session.exec(song_statement).all() for song in songs: # Find performance for this song on this show perf_statement = select(Performance).where( Performance.show_id == show.id, Performance.song_id == song.id ) perf = session.exec(perf_statement).first() if perf: perf.youtube_link = youtube_url session.add(perf) print(f"[SEQUENCE] Linked: {song_title} -> Performance ID {perf.id}") matched_any = True if matched_any: stats['sequences_processed'] += 1 else: print(f"[SEQUENCE NOT FOUND] {title} on {date_str}") stats['songs_not_found'] += 1 continue # Single song song_statement = select(Song).where(Song.title.ilike(f"%{title}%")) songs = session.exec(song_statement).all() matched = False for song in songs: perf_statement = select(Performance).where( Performance.show_id == show.id, Performance.song_id == song.id ) perf = session.exec(perf_statement).first() if perf: perf.youtube_link = youtube_url session.add(perf) print(f"[SONG] Linked: {title} -> Performance ID {perf.id}") stats['songs_matched'] += 1 matched = True break if not matched: print(f"[SONG NOT FOUND] {title} on {date_str}") stats['songs_not_found'] += 1 session.commit() print("\n=== Import Summary ===") for key, value in stats.items(): print(f" {key}: {value}") if __name__ == "__main__": import sys filepath = sys.argv[1] if len(sys.argv) > 1 else "../youtube.md" print(f"Parsing YouTube data from: {filepath}") videos = parse_youtube_md(filepath) print(f"Found {len(videos)} videos") if videos: import_videos(videos)