elmeg-demo/backend/import_youtube.py

172 lines
5.8 KiB
Python

"""
YouTube Video Import Script v2
Imports videos from youtube_videos.json into the database.
"""
import json
import re
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Performance, Show, Song
def make_youtube_url(video_id: str) -> str:
    """Return the YouTube watch-page URL for the given video ID."""
    watch_base = "https://www.youtube.com/watch?v="
    return f"{watch_base}{video_id}"
def extract_song_title(title: str) -> str:
    """Strip YouTube-title decoration and return the bare song title.

    The cleanup rules run in a fixed order; each one deletes the text it
    matches, and the result is stripped of surrounding whitespace.
    """
    # (pattern, flags) pairs, applied top to bottom.
    cleanup_rules = (
        # Leading band-name prefix, e.g. "Goose - ".
        (r'^Goose\s*[-–—]\s*', re.IGNORECASE),
        # Trailing date and venue, e.g. "- 12/13/25 Providence, RI".
        (r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', 0),
        # Trailing "- Live at ..." suffix.
        (r'\s*[-–—]\s*Live at.*$', re.IGNORECASE),
        # "(Official Audio)" / "(Official Video)" / "(Official Visualizer)".
        (r'\s*\(Official\s*(Audio|Video|Visualizer)\)', re.IGNORECASE),
        # "(4K HDR)" marker.
        (r'\s*\(4K\s*HDR\)', re.IGNORECASE),
        # "Set I Opener" style annotation and everything after it.
        (r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', re.IGNORECASE),
    )

    cleaned = title
    for pattern, flags in cleanup_rules:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned.strip()
def _find_songs(session, title):
    """Return every Song whose title contains ``title`` (case-insensitive)."""
    return session.exec(
        select(Song).where(Song.title.ilike(f"%{title}%"))
    ).all()


def _find_performance(session, show_id, song_id):
    """Return the first Performance of ``song_id`` at ``show_id``, or None."""
    return session.exec(
        select(Performance).where(
            Performance.show_id == show_id,
            Performance.song_id == song_id,
        )
    ).first()


def import_videos():
    """Import video links into the database.

    Reads ``youtube_videos.json`` (a list of objects with the keys
    ``videoId``, ``title``, ``type`` and ``date``) and, for each entry:

    * skips entries typed ``documentary``, ``visualizer`` or ``session``,
      and entries without a ``videoId``;
    * skips entries whose ``date`` is missing or not ``YYYY-MM-DD``;
    * looks up the Show on that date and skips the entry if none exists;
    * for ``full_show`` entries, sets ``youtube_link`` on the Show;
    * for sequences (type ``sequence`` or a title containing ``→``), sets
      ``youtube_link`` on every matching Performance of every listed song;
    * otherwise sets ``youtube_link`` on the first matching Performance.

    All changes are committed once after the whole file is processed, and
    a summary of the counters is printed to stdout.
    """
    # Titles contain non-ASCII characters (→, –, —), so decode explicitly
    # instead of relying on the platform default encoding.
    with open("youtube_videos.json", 'r', encoding="utf-8") as f:
        videos = json.load(f)

    stats = {
        'songs_matched': 0,
        'songs_not_found': 0,
        'sequences_processed': 0,
        'full_shows_matched': 0,
        'full_shows_not_found': 0,
        'no_date': 0,
        'skipped': 0,
        'show_not_found': 0
    }

    with Session(engine) as session:
        for video in videos:
            video_id = video.get('videoId')
            # "title": null in the JSON would otherwise reach re.sub as None.
            raw_title = video.get('title') or ''
            video_type = video.get('type', 'song')
            date_str = video.get('date')

            # Skip non-performance content
            if video_type in ('documentary', 'visualizer', 'session'):
                stats['skipped'] += 1
                continue

            # Without an ID we would store a bogus ".../watch?v=None" link.
            if not video_id:
                stats['skipped'] += 1
                continue
            youtube_url = make_youtube_url(video_id)

            # Skip videos without dates (can't match to show)
            if not date_str:
                stats['no_date'] += 1
                continue

            # Parse date; a non-string value raises TypeError, not ValueError.
            try:
                show_date = datetime.strptime(date_str, '%Y-%m-%d')
            except (ValueError, TypeError):
                stats['no_date'] += 1
                continue

            # Find show by date
            # NOTE(review): compares Show.date to a datetime at midnight;
            # confirm this matches the column type declared in models.py.
            show = session.exec(
                select(Show).where(Show.date == show_date)
            ).first()
            if not show:
                stats['show_not_found'] += 1
                continue

            # Handle full shows - link to Show entity
            if video_type == 'full_show':
                show.youtube_link = youtube_url
                session.add(show)
                stats['full_shows_matched'] += 1
                print(f"[FULL SHOW] {date_str}: {raw_title[:50]}")
                continue

            # Extract song title
            song_title = extract_song_title(raw_title)

            # Handle sequences (multiple songs with →).
            # The previous test was `'' in song_title`, which is true for
            # every string and sent every video down this branch.
            if video_type == 'sequence' or '→' in song_title:
                song_titles = [s.strip() for s in re.split(r'[→>]', song_title)]
                matched_any = False
                for title in song_titles:
                    if not title:
                        continue
                    # Link every performance of every partially-matching song.
                    for song in _find_songs(session, title):
                        perf = _find_performance(session, show.id, song.id)
                        if perf:
                            perf.youtube_link = youtube_url
                            session.add(perf)
                            matched_any = True
                            print(f"[SEQ] {date_str}: {title} -> Perf {perf.id}")
                if matched_any:
                    stats['sequences_processed'] += 1
                else:
                    stats['songs_not_found'] += 1
                continue

            # An empty title would become ILIKE '%%' and match every song.
            if not song_title:
                stats['songs_not_found'] += 1
                continue

            # Single song - find and link the first performance at this show
            matched = False
            for song in _find_songs(session, song_title):
                perf = _find_performance(session, show.id, song.id)
                if perf:
                    perf.youtube_link = youtube_url
                    session.add(perf)
                    matched = True
                    stats['songs_matched'] += 1
                    print(f"[SONG] {date_str}: {song_title} -> Perf {perf.id}")
                    break
            if not matched:
                stats['songs_not_found'] += 1

        # Persist all links in one transaction once the file is processed.
        session.commit()

    print("\n" + "="*50)
    print("IMPORT SUMMARY")
    print("="*50)
    for key, value in stats.items():
        print(f" {key}: {value}")
    total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched']
    print(f"\n TOTAL LINKED: {total_linked}")
# Run the import only when executed as a script, so importing this module
# (e.g. to reuse extract_song_title) has no database side effects.
if __name__ == "__main__":
    import_videos()