feat: Improved YouTube matching with fuzzy logic (+40 more videos)
This commit is contained in:
parent
dc584af2f2
commit
0ad89105b3
1 changed file with 115 additions and 32 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
"""
|
"""
|
||||||
YouTube Video Import Script v2
|
YouTube Video Import Script v3
|
||||||
Imports videos from youtube_videos.json into the database.
|
Improved title matching with fuzzy logic and normalization.
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
@ -14,12 +14,34 @@ def make_youtube_url(video_id: str) -> str:
|
||||||
return f"https://www.youtube.com/watch?v={video_id}"
|
return f"https://www.youtube.com/watch?v={video_id}"
|
||||||
|
|
||||||
|
|
||||||
def extract_song_title(title: str) -> str:
|
def normalize_title(title: str) -> str:
    """Normalize a song title into a canonical form for fuzzy comparison.

    The title is lower-cased, stripped of parenthetical/bracketed
    annotations and trailing collaborator credits ("feat.", "ft.", "w/"),
    has "&" spelled out as "and", loses all remaining punctuation, and has
    runs of whitespace collapsed to a single space.

    Args:
        title: Raw title text (from the database or a video title).

    Returns:
        The normalized title; may be an empty string if nothing is left.
    """
    title = title.lower().strip()

    # Remove annotations that are not part of the song name.
    title = re.sub(r'\s*\(.*?\)', '', title)  # Parentheticals
    title = re.sub(r'\s*\[.*?\]', '', title)  # Brackets

    # Remove collaborator credits through the end of the string. The \b
    # anchor is essential: without it "ft" matches inside words such as
    # "drift"/"left"/"soft" and "feat" inside "defeat", truncating the
    # title. No IGNORECASE needed -- the text is already lower-cased.
    title = re.sub(r'\s*\bfeat\.?\s+.*$', '', title)
    title = re.sub(r'\s*\bft\.?\s+.*$', '', title)
    title = re.sub(r'\s*\bw/\s+.*$', '', title)

    # Drop a dash left dangling once a suffix has been removed.
    title = re.sub(r'\s*[-–—]\s*$', '', title)

    # Normalize characters.
    title = title.replace('&', 'and')
    title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
    title = re.sub(r'\s+', ' ', title)  # Collapse whitespace

    return title.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_song_title(raw_title: str) -> str:
|
||||||
"""Extract the actual song title from YouTube video title."""
|
"""Extract the actual song title from YouTube video title."""
|
||||||
|
title = raw_title
|
||||||
|
|
||||||
# Remove common prefixes
|
# Remove common prefixes
|
||||||
title = re.sub(r'^Goose\s*[-–—]\s*', '', title, flags=re.IGNORECASE)
|
title = re.sub(r'^Goose\s*[-–—]\s*', '', title, flags=re.IGNORECASE)
|
||||||
|
|
||||||
# Remove date patterns at end (e.g., "- 12/13/25 Providence, RI")
|
# Remove date patterns (e.g., "- 12/13/25 Providence, RI")
|
||||||
title = re.sub(r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', '', title)
|
title = re.sub(r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', '', title)
|
||||||
|
|
||||||
# Remove "Live at..." suffix
|
# Remove "Live at..." suffix
|
||||||
|
|
@ -29,14 +51,64 @@ def extract_song_title(title: str) -> str:
|
||||||
title = re.sub(r'\s*\(Official\s*(Audio|Video|Visualizer)\)', '', title, flags=re.IGNORECASE)
|
title = re.sub(r'\s*\(Official\s*(Audio|Video|Visualizer)\)', '', title, flags=re.IGNORECASE)
|
||||||
|
|
||||||
# Remove "(4K HDR)" etc
|
# Remove "(4K HDR)" etc
|
||||||
title = re.sub(r'\s*\(4K\s*HDR\)', '', title, flags=re.IGNORECASE)
|
title = re.sub(r'\s*\(4K\s*HDR?\)', '', title, flags=re.IGNORECASE)
|
||||||
|
|
||||||
# Remove "Set I Opener" etc
|
# Remove "Set I/II Opener" etc
|
||||||
title = re.sub(r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', '', title, flags=re.IGNORECASE)
|
title = re.sub(r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', '', title, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
# Remove "Live from..." suffix
|
||||||
|
title = re.sub(r'\s*Live from.*$', '', title, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
# Remove date at start (e.g., "9/20/2025")
|
||||||
|
title = re.sub(r'^\d{1,2}/\d{1,2}/\d{2,4}\s*', '', title)
|
||||||
|
|
||||||
|
# Remove location suffix (e.g., "Providence, RI")
|
||||||
|
title = re.sub(r'\s*[-–—]?\s*[A-Z][a-z]+,?\s*[A-Z]{2}\s*$', '', title)
|
||||||
|
|
||||||
return title.strip()
|
return title.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def find_song_match(session, song_title: str, all_songs: list) -> "Song | None":
    """Find the song that best matches ``song_title``.

    Strategies are tried from strictest to loosest and the first one that
    produces a hit wins:

    1. Case-insensitive equality on the raw titles.
    2. Equality after ``normalize_title``.
    3. One normalized title is a prefix of the other.
    4. One normalized title contains the other (contained string must be
       at least 4 characters long).
    5. For multi-word searches, at least 70% of the search words appear in
       the song title; the candidate with the largest overlap is chosen.

    Args:
        session: Unused here; kept so existing callers keep working.
        song_title: Title extracted from the video.
        all_songs: Pre-loaded song objects; each must expose ``.title``.

    Returns:
        The matching song object, or ``None`` when nothing matches.
    """
    min_substring_len = 4  # Avoid short false positives in substring matching
    word_overlap_ratio = 0.7

    # Strategy 1: Exact match (case insensitive)
    wanted = song_title.lower()
    for song in all_songs:
        if song.title.lower() == wanted:
            return song

    normalized_search = normalize_title(song_title)
    if not normalized_search:
        # An empty needle is a prefix/substring of everything and would
        # "match" the first song in the list.
        return None

    # Normalize every candidate once instead of once per strategy. Songs
    # whose title normalizes to "" are dropped: an empty string is a
    # prefix/substring of any search and would match unconditionally.
    candidates = []
    for song in all_songs:
        norm_song = normalize_title(song.title)
        if norm_song:
            candidates.append((song, norm_song))

    # Strategy 2: Normalized exact match
    for song, norm_song in candidates:
        if norm_song == normalized_search:
            return song

    # Strategy 3: Starts with (for songs with suffixes in DB, or vice versa)
    for song, norm_song in candidates:
        if norm_song.startswith(normalized_search):
            return song
        if normalized_search.startswith(norm_song):
            return song

    # Strategy 4: Contains (substring match). The length guard applies to
    # whichever string is the *contained* one, so a very short DB title
    # cannot match merely by appearing inside a longer search string.
    for song, norm_song in candidates:
        if len(normalized_search) >= min_substring_len and normalized_search in norm_song:
            return song
        if len(norm_song) >= min_substring_len and norm_song in normalized_search:
            return song

    # Strategy 5: Word overlap (for complex, multi-word titles). Keep the
    # candidate sharing the most words rather than the first one over the
    # threshold, so the result does not depend on list order.
    search_words = set(normalized_search.split())
    if len(search_words) >= 2:
        threshold = len(search_words) * word_overlap_ratio
        best_song = None
        best_overlap = 0
        for song, norm_song in candidates:
            overlap = len(search_words & set(norm_song.split()))
            if overlap >= threshold and overlap > best_overlap:
                best_song, best_overlap = song, overlap
        return best_song

    return None
|
||||||
|
|
||||||
|
|
||||||
def import_videos():
|
def import_videos():
|
||||||
"""Import video links into the database."""
|
"""Import video links into the database."""
|
||||||
with open("youtube_videos.json", 'r') as f:
|
with open("youtube_videos.json", 'r') as f:
|
||||||
|
|
@ -45,15 +117,19 @@ def import_videos():
|
||||||
stats = {
|
stats = {
|
||||||
'songs_matched': 0,
|
'songs_matched': 0,
|
||||||
'songs_not_found': 0,
|
'songs_not_found': 0,
|
||||||
|
'songs_not_found_titles': [],
|
||||||
'sequences_processed': 0,
|
'sequences_processed': 0,
|
||||||
'full_shows_matched': 0,
|
'full_shows_matched': 0,
|
||||||
'full_shows_not_found': 0,
|
|
||||||
'no_date': 0,
|
'no_date': 0,
|
||||||
'skipped': 0,
|
'skipped': 0,
|
||||||
'show_not_found': 0
|
'show_not_found': 0
|
||||||
}
|
}
|
||||||
|
|
||||||
with Session(engine) as session:
|
with Session(engine) as session:
|
||||||
|
# Pre-load all songs for faster matching
|
||||||
|
all_songs = session.exec(select(Song)).all()
|
||||||
|
print(f"Loaded {len(all_songs)} songs from database")
|
||||||
|
|
||||||
for video in videos:
|
for video in videos:
|
||||||
video_id = video.get('videoId')
|
video_id = video.get('videoId')
|
||||||
raw_title = video.get('title', '')
|
raw_title = video.get('title', '')
|
||||||
|
|
@ -66,7 +142,7 @@ def import_videos():
|
||||||
stats['skipped'] += 1
|
stats['skipped'] += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip videos without dates (can't match to show)
|
# Skip videos without dates
|
||||||
if not date_str:
|
if not date_str:
|
||||||
stats['no_date'] += 1
|
stats['no_date'] += 1
|
||||||
continue
|
continue
|
||||||
|
|
@ -87,31 +163,27 @@ def import_videos():
|
||||||
stats['show_not_found'] += 1
|
stats['show_not_found'] += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Handle full shows - link to Show entity
|
# Handle full shows
|
||||||
if video_type == 'full_show':
|
if video_type == 'full_show':
|
||||||
show.youtube_link = youtube_url
|
show.youtube_link = youtube_url
|
||||||
session.add(show)
|
session.add(show)
|
||||||
stats['full_shows_matched'] += 1
|
stats['full_shows_matched'] += 1
|
||||||
print(f"[FULL SHOW] {date_str}: {raw_title[:50]}")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract song title
|
# Extract song title
|
||||||
song_title = extract_song_title(raw_title)
|
song_title = extract_song_title(raw_title)
|
||||||
|
|
||||||
# Handle sequences (multiple songs with →)
|
# Handle sequences
|
||||||
if video_type == 'sequence' or '→' in song_title:
|
if video_type == 'sequence' or '→' in song_title or '>' in song_title:
|
||||||
song_titles = [s.strip() for s in re.split(r'[→>]', song_title)]
|
song_titles = [s.strip() for s in re.split(r'[→>]', song_title)]
|
||||||
matched_any = False
|
matched_any = False
|
||||||
|
|
||||||
for title in song_titles:
|
for title in song_titles:
|
||||||
if not title:
|
if not title or len(title) < 2:
|
||||||
continue
|
continue
|
||||||
# Find song by title (case insensitive partial match)
|
|
||||||
songs = session.exec(
|
|
||||||
select(Song).where(Song.title.ilike(f"%{title}%"))
|
|
||||||
).all()
|
|
||||||
|
|
||||||
for song in songs:
|
song = find_song_match(session, title, all_songs)
|
||||||
|
if song:
|
||||||
perf = session.exec(
|
perf = session.exec(
|
||||||
select(Performance).where(
|
select(Performance).where(
|
||||||
Performance.show_id == show.id,
|
Performance.show_id == show.id,
|
||||||
|
|
@ -123,21 +195,18 @@ def import_videos():
|
||||||
perf.youtube_link = youtube_url
|
perf.youtube_link = youtube_url
|
||||||
session.add(perf)
|
session.add(perf)
|
||||||
matched_any = True
|
matched_any = True
|
||||||
print(f"[SEQ] {date_str}: {title} -> Perf {perf.id}")
|
|
||||||
|
|
||||||
if matched_any:
|
if matched_any:
|
||||||
stats['sequences_processed'] += 1
|
stats['sequences_processed'] += 1
|
||||||
else:
|
else:
|
||||||
stats['songs_not_found'] += 1
|
stats['songs_not_found'] += 1
|
||||||
|
stats['songs_not_found_titles'].append(f"SEQ: {song_title}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Single song - find and link
|
# Single song matching
|
||||||
songs = session.exec(
|
song = find_song_match(session, song_title, all_songs)
|
||||||
select(Song).where(Song.title.ilike(f"%{song_title}%"))
|
|
||||||
).all()
|
|
||||||
|
|
||||||
matched = False
|
if song:
|
||||||
for song in songs:
|
|
||||||
perf = session.exec(
|
perf = session.exec(
|
||||||
select(Performance).where(
|
select(Performance).where(
|
||||||
Performance.show_id == show.id,
|
Performance.show_id == show.id,
|
||||||
|
|
@ -148,25 +217,39 @@ def import_videos():
|
||||||
if perf:
|
if perf:
|
||||||
perf.youtube_link = youtube_url
|
perf.youtube_link = youtube_url
|
||||||
session.add(perf)
|
session.add(perf)
|
||||||
matched = True
|
|
||||||
stats['songs_matched'] += 1
|
stats['songs_matched'] += 1
|
||||||
print(f"[SONG] {date_str}: {song_title} -> Perf {perf.id}")
|
else:
|
||||||
break
|
# Song exists but wasn't played at this show
|
||||||
|
|
||||||
if not matched:
|
|
||||||
stats['songs_not_found'] += 1
|
stats['songs_not_found'] += 1
|
||||||
|
stats['songs_not_found_titles'].append(f"{date_str}: {song_title} (song exists, no perf)")
|
||||||
|
else:
|
||||||
|
stats['songs_not_found'] += 1
|
||||||
|
stats['songs_not_found_titles'].append(f"{date_str}: {song_title}")
|
||||||
|
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
print("\n" + "="*50)
|
print("\n" + "="*50)
|
||||||
print("IMPORT SUMMARY")
|
print("IMPORT SUMMARY")
|
||||||
print("="*50)
|
print("="*50)
|
||||||
for key, value in stats.items():
|
print(f" songs_matched: {stats['songs_matched']}")
|
||||||
print(f" {key}: {value}")
|
print(f" sequences_processed: {stats['sequences_processed']}")
|
||||||
|
print(f" full_shows_matched: {stats['full_shows_matched']}")
|
||||||
|
print(f" songs_not_found: {stats['songs_not_found']}")
|
||||||
|
print(f" no_date: {stats['no_date']}")
|
||||||
|
print(f" skipped: {stats['skipped']}")
|
||||||
|
print(f" show_not_found: {stats['show_not_found']}")
|
||||||
|
|
||||||
total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched']
|
total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched']
|
||||||
print(f"\n TOTAL LINKED: {total_linked}")
|
print(f"\n TOTAL LINKED: {total_linked}")
|
||||||
|
|
||||||
|
# Show some unmatched titles for debugging
|
||||||
|
if stats['songs_not_found_titles']:
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("SAMPLE UNMATCHED (first 20):")
|
||||||
|
print("="*50)
|
||||||
|
for title in stats['songs_not_found_titles'][:20]:
|
||||||
|
print(f" - {title}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import_videos()
|
import_videos()
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue