# elmeg-demo/backend/import_youtube.py

"""
YouTube Video Import Script v2
Imports videos from youtube_videos.json into the database.
"""
import json
import re
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Performance, Show, Song


def make_youtube_url(video_id: str) -> str:
    """Build the YouTube watch-page URL for the given video ID."""
    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    return watch_url


def extract_song_title(title: str) -> str:
    """Reduce a YouTube video title to the bare song name.

    The decorations below are stripped in order (order matters, because
    several patterns consume everything up to the end of the string), and
    the result is returned with surrounding whitespace trimmed.
    """
    # (pattern, flags) pairs, applied top to bottom.
    cleanup_steps = (
        # Leading "Goose - " band prefix.
        (r'^Goose\s*[-–—]\s*', re.IGNORECASE),
        # Trailing date and anything after it, e.g. "- 12/13/25 Providence, RI".
        (r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', 0),
        # Trailing "- Live at ..." venue suffix.
        (r'\s*[-–—]\s*Live at.*$', re.IGNORECASE),
        # "(Official Audio)", "(Official Video)", "(Official Visualizer)".
        (r'\s*\(Official\s*(Audio|Video|Visualizer)\)', re.IGNORECASE),
        # "(4K HDR)" quality tag.
        (r'\s*\(4K\s*HDR\)', re.IGNORECASE),
        # "Set I Opener" / "Set 2 Opener" style annotations and what follows.
        (r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', re.IGNORECASE),
    )

    cleaned = title
    for pattern, flags in cleanup_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    return cleaned.strip()


def _iter_performances(session, show_id, title):
    """Yield performances at show ``show_id`` of songs whose title contains ``title``.

    Songs are matched with a case-insensitive substring search; for each
    matching song the first performance recorded for that show (if any) is
    yielded. Queries run lazily, so a caller that only needs the first hit
    can stop early.
    """
    songs = session.exec(
        select(Song).where(Song.title.ilike(f"%{title}%"))
    ).all()

    for song in songs:
        perf = session.exec(
            select(Performance).where(
                Performance.show_id == show_id,
                Performance.song_id == song.id
            )
        ).first()
        if perf:
            yield perf


def import_videos() -> None:
    """Import video links from ``youtube_videos.json`` into the database.

    Each JSON entry is matched to a ``Show`` by its ``date`` field
    (``YYYY-MM-DD``). Full-show videos are linked on the show itself; song
    and sequence videos are linked on the matching ``Performance`` rows of
    that show. All changes are committed once at the end, and a summary of
    the counters is printed.
    """
    # JSON is UTF-8 and titles contain non-ASCII characters ("→", "–"), so do
    # not rely on the platform's default encoding.
    with open("youtube_videos.json", 'r', encoding='utf-8') as f:
        videos = json.load(f)

    stats = {
        'songs_matched': 0,
        'songs_not_found': 0,
        'sequences_processed': 0,
        'full_shows_matched': 0,
        'full_shows_not_found': 0,
        'no_date': 0,
        'skipped': 0,
        'show_not_found': 0
    }

    with Session(engine) as session:
        for video in videos:
            video_id = video.get('videoId')
            # "title": null in the JSON must not reach the regex helpers.
            raw_title = video.get('title') or ''
            video_type = video.get('type', 'song')
            date_str = video.get('date')

            # Skip non-performance content
            if video_type in ('documentary', 'visualizer', 'session'):
                stats['skipped'] += 1
                continue

            # Without an ID we would store a bogus "watch?v=None" link.
            if not video_id:
                stats['skipped'] += 1
                continue
            youtube_url = make_youtube_url(video_id)

            # Skip videos without dates (can't match to show)
            if not date_str:
                stats['no_date'] += 1
                continue

            # Parse date; malformed or non-string values count as "no date".
            try:
                show_date = datetime.strptime(date_str, '%Y-%m-%d')
            except (ValueError, TypeError):
                stats['no_date'] += 1
                continue

            # Find show by date
            show = session.exec(
                select(Show).where(Show.date == show_date)
            ).first()

            if not show:
                stats['show_not_found'] += 1
                continue

            # Handle full shows - link to Show entity
            if video_type == 'full_show':
                show.youtube_link = youtube_url
                session.add(show)
                stats['full_shows_matched'] += 1
                print(f"[FULL SHOW] {date_str}: {raw_title[:50]}")
                continue

            # Extract song title
            song_title = extract_song_title(raw_title)

            # Handle sequences (multiple songs joined by "→", ">" or "->")
            if video_type == 'sequence' or '→' in song_title:
                # Consume the hyphen(s) of an ASCII arrow so "A -> B" yields
                # "A" rather than "A -", which would never match a song.
                song_titles = [
                    s.strip() for s in re.split(r'\s*(?:→|-*>)\s*', song_title)
                ]
                matched_any = False

                for title in song_titles:
                    if not title:
                        continue
                    # Every matching performance in the show gets the link.
                    for perf in _iter_performances(session, show.id, title):
                        perf.youtube_link = youtube_url
                        session.add(perf)
                        matched_any = True
                        print(f"[SEQ] {date_str}: {title} -> Perf {perf.id}")

                if matched_any:
                    stats['sequences_processed'] += 1
                else:
                    stats['songs_not_found'] += 1
                continue

            # An empty title would turn the filter into ILIKE '%%', which
            # matches every song and links the video to an arbitrary
            # performance of the show.
            if not song_title:
                stats['songs_not_found'] += 1
                continue

            # Single song - link only the first matching performance
            perf = next(_iter_performances(session, show.id, song_title), None)
            if perf is None:
                stats['songs_not_found'] += 1
                continue

            perf.youtube_link = youtube_url
            session.add(perf)
            stats['songs_matched'] += 1
            print(f"[SONG] {date_str}: {song_title} -> Perf {perf.id}")

        session.commit()

    print("\n" + "="*50)
    print("IMPORT SUMMARY")
    print("="*50)
    for key, value in stats.items():
        print(f"  {key}: {value}")

    total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched']
    print(f"\n  TOTAL LINKED: {total_linked}")


# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    import_videos()