From 0ad89105b3103843f808f10b3ee2965764f4602b Mon Sep 17 00:00:00 2001
From: fullsizemalt <106900403+fullsizemalt@users.noreply.github.com>
Date: Mon, 22 Dec 2025 23:13:13 -0800
Subject: [PATCH] feat: Improved YouTube matching with fuzzy logic (+40 more
 videos)

---
 backend/import_youtube.py | 147 +++++++++++++++++++++++++++++---------
 1 file changed, 115 insertions(+), 32 deletions(-)

diff --git a/backend/import_youtube.py b/backend/import_youtube.py
index 902beeb..e374813 100644
--- a/backend/import_youtube.py
+++ b/backend/import_youtube.py
@@ -1,6 +1,6 @@
 """
-YouTube Video Import Script v2
-Imports videos from youtube_videos.json into the database.
+YouTube Video Import Script v3
+Improved title matching with fuzzy logic and normalization.
 """
 import json
 import re
@@ -14,12 +14,34 @@ def make_youtube_url(video_id: str) -> str:
     return f"https://www.youtube.com/watch?v={video_id}"
 
 
-def extract_song_title(title: str) -> str:
+def normalize_title(title: str) -> str:
+    """Return a lowercased, punctuation-free form of *title* for comparison.
+
+    Parentheticals, bracketed notes and trailing collaborator credits
+    ("feat.", "ft.", "w/") are dropped, "&" becomes "and", and runs of
+    whitespace collapse to one space.
+    """
+    title = title.lower().strip()
+    title = re.sub(r'\s*\(.*?\)', '', title)  # parentheticals
+    title = re.sub(r'\s*\[.*?\]', '', title)  # bracketed notes
+    # \b keeps "feat"/"ft" from matching inside words ("left", "defeat").
+    title = re.sub(r'\s*\b(?:feat|ft)\.?\s+.*$', '', title)
+    title = re.sub(r'\s*\bw/\s+.*$', '', title)  # "w/ guest" credits
+    title = re.sub(r'\s*[-–—]\s*$', '', title)  # trailing dash
+    title = title.replace('&', 'and')
+    title = re.sub(r'[^\w\s]', '', title)  # remaining punctuation
+    title = re.sub(r'\s+', ' ', title)  # collapse whitespace
+    return title.strip()
+
+
+def extract_song_title(raw_title: str) -> str:
     """Extract the actual song title from YouTube video title."""
+    title = raw_title
+    
     # Remove common prefixes
     title = re.sub(r'^Goose\s*[-–—]\s*', '', title, flags=re.IGNORECASE)
     
-    # Remove date patterns at end (e.g., "- 12/13/25 Providence, RI")
+    # Remove date patterns (e.g., "- 12/13/25 Providence, RI")
     title = re.sub(r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', '', title)
     
     # Remove "Live at..." suffix
@@ -29,14 +51,64 @@ def extract_song_title(title: str) -> str:
     title = re.sub(r'\s*\(Official\s*(Audio|Video|Visualizer)\)', '', title, flags=re.IGNORECASE)
     
     # Remove "(4K HDR)" etc
-    title = re.sub(r'\s*\(4K\s*HDR\)', '', title, flags=re.IGNORECASE)
+    title = re.sub(r'\s*\(4K\s*HDR?\)', '', title, flags=re.IGNORECASE)
     
-    # Remove "Set I Opener" etc
+    # Remove "Set I/II Opener" etc
     title = re.sub(r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', '', title, flags=re.IGNORECASE)
     
+    # Remove "Live from..." suffix
+    title = re.sub(r'\s*Live from.*$', '', title, flags=re.IGNORECASE)
+    
+    # Remove date at start (e.g., "9/20/2025")
+    title = re.sub(r'^\d{1,2}/\d{1,2}/\d{2,4}\s*', '', title)
+    
+    # Remove location suffix (e.g., "Providence, RI")
+    title = re.sub(r'\s*[-–—]?\s*[A-Z][a-z]+,?\s*[A-Z]{2}\s*$', '', title)
+    
     return title.strip()
 
 
+def find_song_match(session, song_title: str, all_songs: list) -> "Song | None":
+    """Return the first song in *all_songs* matching *song_title*, else None.
+
+    Strategies run from strictest to loosest: case-insensitive equality,
+    normalized equality, normalized prefix, normalized substring, then
+    word overlap of at least 70%. *session* is unused; it is kept so
+    existing callers keep working.
+    """
+    wanted = song_title.lower()
+    for song in all_songs:
+        if song.title.lower() == wanted:
+            return song
+
+    normalized_search = normalize_title(song_title)
+    if not normalized_search:
+        # "" is a prefix and substring of every title, so an empty search
+        # would otherwise "match" the first song in the list.
+        return None
+
+    # Normalize each DB title once; drop titles that normalize to "" since
+    # they would match every search in the prefix/substring passes.
+    candidates = [(normalize_title(s.title), s) for s in all_songs]
+    candidates = [(norm, s) for norm, s in candidates if norm]
+    for norm, song in candidates:
+        if norm == normalized_search:
+            return song
+    for norm, song in candidates:
+        if norm.startswith(normalized_search) or normalized_search.startswith(norm):
+            return song
+    if len(normalized_search) >= 4:  # short searches give false positives
+        for norm, song in candidates:
+            if normalized_search in norm or norm in normalized_search:
+                return song
+    search_words = set(normalized_search.split())
+    if len(search_words) >= 2:  # word overlap only for multi-word titles
+        for norm, song in candidates:
+            if len(search_words & set(norm.split())) >= len(search_words) * 0.7:
+                return song
+    return None
+
+
 def import_videos():
     """Import video links into the database."""
     with open("youtube_videos.json", 'r') as f:
@@ -45,15 +117,19 @@ def import_videos():
     stats = {
         'songs_matched': 0,
         'songs_not_found': 0,
+        'songs_not_found_titles': [],
         'sequences_processed': 0,
         'full_shows_matched': 0,
-        'full_shows_not_found': 0,
         'no_date': 0,
         'skipped': 0,
         'show_not_found': 0
     }
     
     with Session(engine) as session:
+        # Pre-load all songs for faster matching
+        all_songs = session.exec(select(Song)).all()
+        print(f"Loaded {len(all_songs)} songs from database")
+        
         for video in videos:
             video_id = video.get('videoId')
             raw_title = video.get('title', '')
@@ -66,7 +142,7 @@ def import_videos():
                 stats['skipped'] += 1
                 continue
             
-            # Skip videos without dates (can't match to show)
+            # Skip videos without dates
             if not date_str:
                 stats['no_date'] += 1
                 continue
@@ -87,31 +163,27 @@ def import_videos():
                 stats['show_not_found'] += 1
                 continue
             
-            # Handle full shows - link to Show entity
+            # Handle full shows
             if video_type == 'full_show':
                 show.youtube_link = youtube_url
                 session.add(show)
                 stats['full_shows_matched'] += 1
-                print(f"[FULL SHOW] {date_str}: {raw_title[:50]}")
                 continue
             
             # Extract song title
             song_title = extract_song_title(raw_title)
             
-            # Handle sequences (multiple songs with →)
-            if video_type == 'sequence' or '→' in song_title:
+            # Handle sequences
+            if video_type == 'sequence' or '→' in song_title or '>' in song_title:
                 song_titles = [s.strip() for s in re.split(r'[→>]', song_title)]
                 matched_any = False
                 
                 for title in song_titles:
-                    if not title:
+                    if not title or len(title) < 2:
                         continue
-                    # Find song by title (case insensitive partial match)
-                    songs = session.exec(
-                        select(Song).where(Song.title.ilike(f"%{title}%"))
-                    ).all()
                     
-                    for song in songs:
+                    song = find_song_match(session, title, all_songs)
+                    if song:
                         perf = session.exec(
                             select(Performance).where(
                                 Performance.show_id == show.id,
@@ -123,21 +195,18 @@ def import_videos():
                             perf.youtube_link = youtube_url
                             session.add(perf)
                             matched_any = True
-                            print(f"[SEQ] {date_str}: {title} -> Perf {perf.id}")
                 
                 if matched_any:
                     stats['sequences_processed'] += 1
                 else:
                     stats['songs_not_found'] += 1
+                    stats['songs_not_found_titles'].append(f"SEQ: {song_title}")
                 continue
             
-            # Single song - find and link
-            songs = session.exec(
-                select(Song).where(Song.title.ilike(f"%{song_title}%"))
-            ).all()
+            # Single song matching
+            song = find_song_match(session, song_title, all_songs)
             
-            matched = False
-            for song in songs:
+            if song:
                 perf = session.exec(
                     select(Performance).where(
                         Performance.show_id == show.id,
@@ -148,24 +217,38 @@ def import_videos():
                 if perf:
                     perf.youtube_link = youtube_url
                     session.add(perf)
-                    matched = True
                     stats['songs_matched'] += 1
-                    print(f"[SONG] {date_str}: {song_title} -> Perf {perf.id}")
-                    break
-            
-            if not matched:
+                else:
+                    # Song exists but wasn't played at this show
+                    stats['songs_not_found'] += 1
+                    stats['songs_not_found_titles'].append(f"{date_str}: {song_title} (song exists, no perf)")
+            else:
                 stats['songs_not_found'] += 1
+                stats['songs_not_found_titles'].append(f"{date_str}: {song_title}")
         
         session.commit()
     
     print("\n" + "="*50)
     print("IMPORT SUMMARY")
     print("="*50)
-    for key, value in stats.items():
-        print(f"  {key}: {value}")
+    print(f"  songs_matched: {stats['songs_matched']}")
+    print(f"  sequences_processed: {stats['sequences_processed']}")
+    print(f"  full_shows_matched: {stats['full_shows_matched']}")
+    print(f"  songs_not_found: {stats['songs_not_found']}")
+    print(f"  no_date: {stats['no_date']}")
+    print(f"  skipped: {stats['skipped']}")
+    print(f"  show_not_found: {stats['show_not_found']}")
     
     total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched']
     print(f"\n  TOTAL LINKED: {total_linked}")
+    
+    # Show some unmatched titles for debugging
+    if stats['songs_not_found_titles']:
+        print("\n" + "="*50)
+        print("SAMPLE UNMATCHED (first 20):")
+        print("="*50)
+        for title in stats['songs_not_found_titles'][:20]:
+            print(f"  - {title}")
 
 
 if __name__ == "__main__":