""" YouTube Video Import Script v3 Improved title matching with fuzzy logic and normalization. """ import json import re from datetime import datetime from sqlmodel import Session, select from database import engine from models import Performance, Show, Song def make_youtube_url(video_id: str) -> str: return f"https://www.youtube.com/watch?v={video_id}" def normalize_title(title: str) -> str: """Normalize title for better matching.""" title = title.lower().strip() # Remove common suffixes/prefixes title = re.sub(r'\s*\(.*?\)', '', title) # Remove parentheticals title = re.sub(r'\s*\[.*?\]', '', title) # Remove brackets title = re.sub(r'\s*feat\.?\s+.*$', '', title, flags=re.IGNORECASE) # Remove feat. title = re.sub(r'\s*ft\.?\s+.*$', '', title, flags=re.IGNORECASE) # Remove ft. title = re.sub(r'\s*w/\s+.*$', '', title) # Remove w/ collaborators title = re.sub(r'\s*[-–—]\s*$', '', title) # Trailing dashes # Normalize characters title = title.replace('&', 'and') title = re.sub(r'[^\w\s]', '', title) # Remove punctuation title = re.sub(r'\s+', ' ', title) # Collapse whitespace return title.strip() def extract_song_title(raw_title: str) -> str: """Extract the actual song title from YouTube video title.""" title = raw_title # Remove common prefixes title = re.sub(r'^Goose\s*[-–—]\s*', '', title, flags=re.IGNORECASE) # Remove date patterns (e.g., "- 12/13/25 Providence, RI") title = re.sub(r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', '', title) # Remove "Live at..." suffix title = re.sub(r'\s*[-–—]\s*Live at.*$', '', title, flags=re.IGNORECASE) # Remove "(Official Audio)" etc title = re.sub(r'\s*\(Official\s*(Audio|Video|Visualizer)\)', '', title, flags=re.IGNORECASE) # Remove "(4K HDR)" etc title = re.sub(r'\s*\(4K\s*HDR?\)', '', title, flags=re.IGNORECASE) # Remove "Set I/II Opener" etc title = re.sub(r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', '', title, flags=re.IGNORECASE) # Remove "Live from..." suffix title = re.sub(r'\s*Live from.*$', '', title, flags=re.IGNORECASE) # Remove date at start (e.g., "9/20/2025") title = re.sub(r'^\d{1,2}/\d{1,2}/\d{2,4}\s*', '', title) # Remove location suffix (e.g., "Providence, RI") title = re.sub(r'\s*[-–—]?\s*[A-Z][a-z]+,?\s*[A-Z]{2}\s*$', '', title) return title.strip() def find_song_match(session, song_title: str, all_songs: list) -> Song: """Try multiple matching strategies to find a song.""" normalized_search = normalize_title(song_title) # Strategy 1: Exact match (case insensitive) for song in all_songs: if song.title.lower() == song_title.lower(): return song # Strategy 2: Normalized exact match for song in all_songs: if normalize_title(song.title) == normalized_search: return song # Strategy 3: Starts with (for songs with suffixes in DB) for song in all_songs: if normalize_title(song.title).startswith(normalized_search): return song if normalized_search.startswith(normalize_title(song.title)): return song # Strategy 4: Contains (substring match) for song in all_songs: norm_song = normalize_title(song.title) if len(normalized_search) >= 4: # Avoid short false positives if normalized_search in norm_song or norm_song in normalized_search: return song # Strategy 5: Word overlap (for complex titles) search_words = set(normalized_search.split()) if len(search_words) >= 2: # Only for multi-word titles for song in all_songs: song_words = set(normalize_title(song.title).split()) # If most words match overlap = len(search_words & song_words) if overlap >= len(search_words) * 0.7: return song return None def import_videos(): """Import video links into the database.""" with open("youtube_videos.json", 'r') as f: videos = json.load(f) stats = { 'songs_matched': 0, 'songs_not_found': 0, 'songs_not_found_titles': [], 'sequences_processed': 0, 'full_shows_matched': 0, 'no_date': 0, 'skipped': 0, 'show_not_found': 0 } with Session(engine) as session: # Pre-load all songs for faster matching all_songs = session.exec(select(Song)).all() print(f"Loaded {len(all_songs)} songs from database") for video in videos: video_id = video.get('videoId') raw_title = video.get('title', '') video_type = video.get('type', 'song') date_str = video.get('date') youtube_url = make_youtube_url(video_id) # Skip non-performance content if video_type in ('documentary', 'visualizer', 'session'): stats['skipped'] += 1 continue # Skip videos without dates if not date_str: stats['no_date'] += 1 continue # Parse date try: show_date = datetime.strptime(date_str, '%Y-%m-%d') except ValueError: stats['no_date'] += 1 continue # Find show by date show = session.exec( select(Show).where(Show.date == show_date) ).first() if not show: stats['show_not_found'] += 1 continue # Handle full shows if video_type == 'full_show': show.youtube_link = youtube_url session.add(show) stats['full_shows_matched'] += 1 continue # Extract song title song_title = extract_song_title(raw_title) # Handle sequences if video_type == 'sequence' or '→' in song_title or '>' in song_title: song_titles = [s.strip() for s in re.split(r'[→>]', song_title)] matched_any = False for title in song_titles: if not title or len(title) < 2: continue song = find_song_match(session, title, all_songs) if song: perf = session.exec( select(Performance).where( Performance.show_id == show.id, Performance.song_id == song.id ) ).first() if perf: perf.youtube_link = youtube_url session.add(perf) matched_any = True if matched_any: stats['sequences_processed'] += 1 else: stats['songs_not_found'] += 1 stats['songs_not_found_titles'].append(f"SEQ: {song_title}") continue # Single song matching song = find_song_match(session, song_title, all_songs) if song: perf = session.exec( select(Performance).where( Performance.show_id == show.id, Performance.song_id == song.id ) ).first() if perf: perf.youtube_link = youtube_url session.add(perf) stats['songs_matched'] += 1 else: # Song exists but wasn't played at this show stats['songs_not_found'] += 1 stats['songs_not_found_titles'].append(f"{date_str}: {song_title} (song exists, no perf)") else: stats['songs_not_found'] += 1 stats['songs_not_found_titles'].append(f"{date_str}: {song_title}") session.commit() print("\n" + "="*50) print("IMPORT SUMMARY") print("="*50) print(f" songs_matched: {stats['songs_matched']}") print(f" sequences_processed: {stats['sequences_processed']}") print(f" full_shows_matched: {stats['full_shows_matched']}") print(f" songs_not_found: {stats['songs_not_found']}") print(f" no_date: {stats['no_date']}") print(f" skipped: {stats['skipped']}") print(f" show_not_found: {stats['show_not_found']}") total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched'] print(f"\n TOTAL LINKED: {total_linked}") # Show some unmatched titles for debugging if stats['songs_not_found_titles']: print("\n" + "="*50) print("SAMPLE UNMATCHED (first 20):") print("="*50) for title in stats['songs_not_found_titles'][:20]: print(f" - {title}") if __name__ == "__main__": import_videos()