elmeg-demo/backend/import_youtube.py

"""
YouTube Video Import Script v3
Improved title matching with fuzzy logic and normalization.
"""
import json
import re
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Performance, Show, Song


def make_youtube_url(video_id: str) -> str:
    return f"https://www.youtube.com/watch?v={video_id}"


def normalize_title(title: str) -> str:
    """Normalize title for better matching."""
    title = title.lower().strip()

    # Remove common suffixes/prefixes
    title = re.sub(r'\s*\(.*?\)', '', title)  # Remove parentheticals
    title = re.sub(r'\s*\[.*?\]', '', title)  # Remove brackets
    title = re.sub(r'\s*feat\.?\s+.*$', '', title, flags=re.IGNORECASE)  # Remove feat.
    title = re.sub(r'\s*ft\.?\s+.*$', '', title, flags=re.IGNORECASE)  # Remove ft.
    title = re.sub(r'\s*w/\s+.*$', '', title)  # Remove w/ collaborators
    title = re.sub(r'\s*[-–—]\s*$', '', title)  # Trailing dashes

    # Normalize characters
    title = title.replace('&', 'and')
    title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
    title = re.sub(r'\s+', ' ', title)  # Collapse whitespace

    return title.strip()


def extract_song_title(raw_title: str) -> str:
    """Extract the actual song title from YouTube video title."""
    title = raw_title

    # Remove common prefixes
    title = re.sub(r'^Goose\s*[-–—]\s*', '', title, flags=re.IGNORECASE)

    # Remove date patterns (e.g., "- 12/13/25 Providence, RI")
    title = re.sub(r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', '', title)

    # Remove "Live at..." suffix
    title = re.sub(r'\s*[-–—]\s*Live at.*$', '', title, flags=re.IGNORECASE)

    # Remove "(Official Audio)" etc
    title = re.sub(r'\s*\(Official\s*(Audio|Video|Visualizer)\)', '', title, flags=re.IGNORECASE)

    # Remove "(4K HDR)" etc
    title = re.sub(r'\s*\(4K\s*HDR?\)', '', title, flags=re.IGNORECASE)

    # Remove "Set I/II Opener" etc
    title = re.sub(r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', '', title, flags=re.IGNORECASE)

    # Remove "Live from..." suffix
    title = re.sub(r'\s*Live from.*$', '', title, flags=re.IGNORECASE)

    # Remove date at start (e.g., "9/20/2025")
    title = re.sub(r'^\d{1,2}/\d{1,2}/\d{2,4}\s*', '', title)

    # Remove location suffix (e.g., "Providence, RI")
    title = re.sub(r'\s*[-–—]?\s*[A-Z][a-z]+,?\s*[A-Z]{2}\s*$', '', title)

    return title.strip()


def find_song_match(session, song_title: str, all_songs: list) -> Song:
    """Try multiple matching strategies to find a song."""
    normalized_search = normalize_title(song_title)

    # Strategy 1: Exact match (case insensitive)
    for song in all_songs:
        if song.title.lower() == song_title.lower():
            return song

    # Strategy 2: Normalized exact match
    for song in all_songs:
        if normalize_title(song.title) == normalized_search:
            return song

    # Strategy 3: Starts with (for songs with suffixes in DB)
    for song in all_songs:
        if normalize_title(song.title).startswith(normalized_search):
            return song
        if normalized_search.startswith(normalize_title(song.title)):
            return song

    # Strategy 4: Contains (substring match)
    for song in all_songs:
        norm_song = normalize_title(song.title)
        if len(normalized_search) >= 4:  # Avoid short false positives
            if normalized_search in norm_song or norm_song in normalized_search:
                return song

    # Strategy 5: Word overlap (for complex titles)
    search_words = set(normalized_search.split())
    if len(search_words) >= 2:  # Only for multi-word titles
        for song in all_songs:
            song_words = set(normalize_title(song.title).split())
            # If most words match
            overlap = len(search_words & song_words)
            if overlap >= len(search_words) * 0.7:
                return song

    return None


def import_videos():
    """Import video links into the database."""
    with open("youtube_videos.json", 'r') as f:
        videos = json.load(f)

    stats = {
        'songs_matched': 0,
        'songs_not_found': 0,
        'songs_not_found_titles': [],
        'sequences_processed': 0,
        'full_shows_matched': 0,
        'no_date': 0,
        'skipped': 0,
        'show_not_found': 0
    }

    with Session(engine) as session:
        # Pre-load all songs for faster matching
        all_songs = session.exec(select(Song)).all()
        print(f"Loaded {len(all_songs)} songs from database")

        for video in videos:
            video_id = video.get('videoId')
            raw_title = video.get('title', '')
            video_type = video.get('type', 'song')
            date_str = video.get('date')
            youtube_url = make_youtube_url(video_id)

            # Skip non-performance content
            if video_type in ('documentary', 'visualizer', 'session'):
                stats['skipped'] += 1
                continue

            # Skip videos without dates
            if not date_str:
                stats['no_date'] += 1
                continue

            # Parse date
            try:
                show_date = datetime.strptime(date_str, '%Y-%m-%d')
            except ValueError:
                stats['no_date'] += 1
                continue

            # Find show by date
            show = session.exec(
                select(Show).where(Show.date == show_date)
            ).first()

            if not show:
                stats['show_not_found'] += 1
                continue

            # Handle full shows
            if video_type == 'full_show':
                show.youtube_link = youtube_url
                session.add(show)
                stats['full_shows_matched'] += 1
                continue

            # Extract song title
            song_title = extract_song_title(raw_title)

            # Handle sequences
            if video_type == 'sequence' or '→' in song_title or '>' in song_title:
                song_titles = [s.strip() for s in re.split(r'[→>]', song_title)]
                matched_any = False

                for title in song_titles:
                    if not title or len(title) < 2:
                        continue

                    song = find_song_match(session, title, all_songs)
                    if song:
                        perf = session.exec(
                            select(Performance).where(
                                Performance.show_id == show.id,
                                Performance.song_id == song.id
                            )
                        ).first()

                        if perf:
                            perf.youtube_link = youtube_url
                            session.add(perf)
                            matched_any = True

                if matched_any:
                    stats['sequences_processed'] += 1
                else:
                    stats['songs_not_found'] += 1
                    stats['songs_not_found_titles'].append(f"SEQ: {song_title}")
                continue

            # Single song matching
            song = find_song_match(session, song_title, all_songs)

            if song:
                perf = session.exec(
                    select(Performance).where(
                        Performance.show_id == show.id,
                        Performance.song_id == song.id
                    )
                ).first()

                if perf:
                    perf.youtube_link = youtube_url
                    session.add(perf)
                    stats['songs_matched'] += 1
                else:
                    # Song exists but wasn't played at this show
                    stats['songs_not_found'] += 1
                    stats['songs_not_found_titles'].append(f"{date_str}: {song_title} (song exists, no perf)")
            else:
                stats['songs_not_found'] += 1
                stats['songs_not_found_titles'].append(f"{date_str}: {song_title}")

        session.commit()

    print("\n" + "="*50)
    print("IMPORT SUMMARY")
    print("="*50)
    print(f"  songs_matched: {stats['songs_matched']}")
    print(f"  sequences_processed: {stats['sequences_processed']}")
    print(f"  full_shows_matched: {stats['full_shows_matched']}")
    print(f"  songs_not_found: {stats['songs_not_found']}")
    print(f"  no_date: {stats['no_date']}")
    print(f"  skipped: {stats['skipped']}")
    print(f"  show_not_found: {stats['show_not_found']}")

    total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched']
    print(f"\n  TOTAL LINKED: {total_linked}")

    # Show some unmatched titles for debugging
    if stats['songs_not_found_titles']:
        print("\n" + "="*50)
        print("SAMPLE UNMATCHED (first 20):")
        print("="*50)
        for title in stats['songs_not_found_titles'][:20]:
            print(f"  - {title}")


if __name__ == "__main__":
    import_videos()