From 0ad89105b3103843f808f10b3ee2965764f4602b Mon Sep 17 00:00:00 2001 From: fullsizemalt <106900403+fullsizemalt@users.noreply.github.com> Date: Mon, 22 Dec 2025 23:13:13 -0800 Subject: [PATCH] feat: Improved YouTube matching with fuzzy logic (+40 more videos) --- backend/import_youtube.py | 147 +++++++++++++++++++++++++++++--------- 1 file changed, 115 insertions(+), 32 deletions(-) diff --git a/backend/import_youtube.py b/backend/import_youtube.py index 902beeb..e374813 100644 --- a/backend/import_youtube.py +++ b/backend/import_youtube.py @@ -1,6 +1,6 @@ """ -YouTube Video Import Script v2 -Imports videos from youtube_videos.json into the database. +YouTube Video Import Script v3 +Improved title matching with fuzzy logic and normalization. """ import json import re @@ -14,12 +14,34 @@ def make_youtube_url(video_id: str) -> str: return f"https://www.youtube.com/watch?v={video_id}" -def extract_song_title(title: str) -> str: +def normalize_title(title: str) -> str: + """Normalize title for better matching.""" + title = title.lower().strip() + + # Remove common suffixes/prefixes + title = re.sub(r'\s*\(.*?\)', '', title) # Remove parentheticals + title = re.sub(r'\s*\[.*?\]', '', title) # Remove brackets + title = re.sub(r'\s*feat\.?\s+.*$', '', title, flags=re.IGNORECASE) # Remove feat. + title = re.sub(r'\s*ft\.?\s+.*$', '', title, flags=re.IGNORECASE) # Remove ft. + title = re.sub(r'\s*w/\s+.*$', '', title) # Remove w/ collaborators + title = re.sub(r'\s*[-–—]\s*$', '', title) # Trailing dashes + + # Normalize characters + title = title.replace('&', 'and') + title = re.sub(r'[^\w\s]', '', title) # Remove punctuation + title = re.sub(r'\s+', ' ', title) # Collapse whitespace + + return title.strip() + + +def extract_song_title(raw_title: str) -> str: """Extract the actual song title from YouTube video title.""" + title = raw_title + # Remove common prefixes title = re.sub(r'^Goose\s*[-–—]\s*', '', title, flags=re.IGNORECASE) - # Remove date patterns at end (e.g., "- 12/13/25 Providence, RI") + # Remove date patterns (e.g., "- 12/13/25 Providence, RI") title = re.sub(r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', '', title) # Remove "Live at..." suffix @@ -29,14 +51,64 @@ def extract_song_title(title: str) -> str: title = re.sub(r'\s*\(Official\s*(Audio|Video|Visualizer)\)', '', title, flags=re.IGNORECASE) # Remove "(4K HDR)" etc - title = re.sub(r'\s*\(4K\s*HDR\)', '', title, flags=re.IGNORECASE) + title = re.sub(r'\s*\(4K\s*HDR?\)', '', title, flags=re.IGNORECASE) - # Remove "Set I Opener" etc + # Remove "Set I/II Opener" etc title = re.sub(r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', '', title, flags=re.IGNORECASE) + # Remove "Live from..." suffix + title = re.sub(r'\s*Live from.*$', '', title, flags=re.IGNORECASE) + + # Remove date at start (e.g., "9/20/2025") + title = re.sub(r'^\d{1,2}/\d{1,2}/\d{2,4}\s*', '', title) + + # Remove location suffix (e.g., "Providence, RI") + title = re.sub(r'\s*[-–—]?\s*[A-Z][a-z]+,?\s*[A-Z]{2}\s*$', '', title) + return title.strip() +def find_song_match(session, song_title: str, all_songs: list) -> Song: + """Try multiple matching strategies to find a song.""" + normalized_search = normalize_title(song_title) + + # Strategy 1: Exact match (case insensitive) + for song in all_songs: + if song.title.lower() == song_title.lower(): + return song + + # Strategy 2: Normalized exact match + for song in all_songs: + if normalize_title(song.title) == normalized_search: + return song + + # Strategy 3: Starts with (for songs with suffixes in DB) + for song in all_songs: + if normalize_title(song.title).startswith(normalized_search): + return song + if normalized_search.startswith(normalize_title(song.title)): + return song + + # Strategy 4: Contains (substring match) + for song in all_songs: + norm_song = normalize_title(song.title) + if len(normalized_search) >= 4: # Avoid short false positives + if normalized_search in norm_song or norm_song in normalized_search: + return song + + # Strategy 5: Word overlap (for complex titles) + search_words = set(normalized_search.split()) + if len(search_words) >= 2: # Only for multi-word titles + for song in all_songs: + song_words = set(normalize_title(song.title).split()) + # If most words match + overlap = len(search_words & song_words) + if overlap >= len(search_words) * 0.7: + return song + + return None + + def import_videos(): """Import video links into the database.""" with open("youtube_videos.json", 'r') as f: @@ -45,15 +117,19 @@ def import_videos(): stats = { 'songs_matched': 0, 'songs_not_found': 0, + 'songs_not_found_titles': [], 'sequences_processed': 0, 'full_shows_matched': 0, - 'full_shows_not_found': 0, 'no_date': 0, 'skipped': 0, 'show_not_found': 0 } with Session(engine) as session: + # Pre-load all songs for faster matching + all_songs = session.exec(select(Song)).all() + print(f"Loaded {len(all_songs)} songs from database") + for video in videos: video_id = video.get('videoId') raw_title = video.get('title', '') @@ -66,7 +142,7 @@ def import_videos(): stats['skipped'] += 1 continue - # Skip videos without dates (can't match to show) + # Skip videos without dates if not date_str: stats['no_date'] += 1 continue @@ -87,31 +163,27 @@ def import_videos(): stats['show_not_found'] += 1 continue - # Handle full shows - link to Show entity + # Handle full shows if video_type == 'full_show': show.youtube_link = youtube_url session.add(show) stats['full_shows_matched'] += 1 - print(f"[FULL SHOW] {date_str}: {raw_title[:50]}") continue # Extract song title song_title = extract_song_title(raw_title) - # Handle sequences (multiple songs with →) - if video_type == 'sequence' or '→' in song_title: + # Handle sequences + if video_type == 'sequence' or '→' in song_title or '>' in song_title: song_titles = [s.strip() for s in re.split(r'[→>]', song_title)] matched_any = False for title in song_titles: - if not title: + if not title or len(title) < 2: continue - # Find song by title (case insensitive partial match) - songs = session.exec( - select(Song).where(Song.title.ilike(f"%{title}%")) - ).all() - for song in songs: + song = find_song_match(session, title, all_songs) + if song: perf = session.exec( select(Performance).where( Performance.show_id == show.id, @@ -123,21 +195,18 @@ def import_videos(): perf.youtube_link = youtube_url session.add(perf) matched_any = True - print(f"[SEQ] {date_str}: {title} -> Perf {perf.id}") if matched_any: stats['sequences_processed'] += 1 else: stats['songs_not_found'] += 1 + stats['songs_not_found_titles'].append(f"SEQ: {song_title}") continue - # Single song - find and link - songs = session.exec( - select(Song).where(Song.title.ilike(f"%{song_title}%")) - ).all() + # Single song matching + song = find_song_match(session, song_title, all_songs) - matched = False - for song in songs: + if song: perf = session.exec( select(Performance).where( Performance.show_id == show.id, @@ -148,24 +217,38 @@ def import_videos(): if perf: perf.youtube_link = youtube_url session.add(perf) - matched = True stats['songs_matched'] += 1 - print(f"[SONG] {date_str}: {song_title} -> Perf {perf.id}") - break - - if not matched: + else: + # Song exists but wasn't played at this show + stats['songs_not_found'] += 1 + stats['songs_not_found_titles'].append(f"{date_str}: {song_title} (song exists, no perf)") + else: stats['songs_not_found'] += 1 + stats['songs_not_found_titles'].append(f"{date_str}: {song_title}") session.commit() print("\n" + "="*50) print("IMPORT SUMMARY") print("="*50) - for key, value in stats.items(): - print(f" {key}: {value}") + print(f" songs_matched: {stats['songs_matched']}") + print(f" sequences_processed: {stats['sequences_processed']}") + print(f" full_shows_matched: {stats['full_shows_matched']}") + print(f" songs_not_found: {stats['songs_not_found']}") + print(f" no_date: {stats['no_date']}") + print(f" skipped: {stats['skipped']}") + print(f" show_not_found: {stats['show_not_found']}") total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched'] print(f"\n TOTAL LINKED: {total_linked}") + + # Show some unmatched titles for debugging + if stats['songs_not_found_titles']: + print("\n" + "="*50) + print("SAMPLE UNMATCHED (first 20):") + print("="*50) + for title in stats['songs_not_found_titles'][:20]: + print(f" - {title}") if __name__ == "__main__":