elmeg-demo/backend/import_youtube.py

255 lines
9.1 KiB
Python

"""
YouTube Video Import Script v3
Improved title matching with fuzzy logic and normalization.
"""
import json
import re
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Performance, Show, Song
def make_youtube_url(video_id: str) -> str:
return f"https://www.youtube.com/watch?v={video_id}"
def normalize_title(title: str) -> str:
"""Normalize title for better matching."""
title = title.lower().strip()
# Remove common suffixes/prefixes
title = re.sub(r'\s*\(.*?\)', '', title) # Remove parentheticals
title = re.sub(r'\s*\[.*?\]', '', title) # Remove brackets
title = re.sub(r'\s*feat\.?\s+.*$', '', title, flags=re.IGNORECASE) # Remove feat.
title = re.sub(r'\s*ft\.?\s+.*$', '', title, flags=re.IGNORECASE) # Remove ft.
title = re.sub(r'\s*w/\s+.*$', '', title) # Remove w/ collaborators
title = re.sub(r'\s*[-–—]\s*$', '', title) # Trailing dashes
# Normalize characters
title = title.replace('&', 'and')
title = re.sub(r'[^\w\s]', '', title) # Remove punctuation
title = re.sub(r'\s+', ' ', title) # Collapse whitespace
return title.strip()
def extract_song_title(raw_title: str) -> str:
"""Extract the actual song title from YouTube video title."""
title = raw_title
# Remove common prefixes
title = re.sub(r'^Goose\s*[-–—]\s*', '', title, flags=re.IGNORECASE)
# Remove date patterns (e.g., "- 12/13/25 Providence, RI")
title = re.sub(r'\s*[-–—]\s*\d{1,2}/\d{1,2}/\d{2,4}.*$', '', title)
# Remove "Live at..." suffix
title = re.sub(r'\s*[-–—]\s*Live at.*$', '', title, flags=re.IGNORECASE)
# Remove "(Official Audio)" etc
title = re.sub(r'\s*\(Official\s*(Audio|Video|Visualizer)\)', '', title, flags=re.IGNORECASE)
# Remove "(4K HDR)" etc
title = re.sub(r'\s*\(4K\s*HDR?\)', '', title, flags=re.IGNORECASE)
# Remove "Set I/II Opener" etc
title = re.sub(r'\s*Set\s*(I|II|1|2)?\s*Opener.*$', '', title, flags=re.IGNORECASE)
# Remove "Live from..." suffix
title = re.sub(r'\s*Live from.*$', '', title, flags=re.IGNORECASE)
# Remove date at start (e.g., "9/20/2025")
title = re.sub(r'^\d{1,2}/\d{1,2}/\d{2,4}\s*', '', title)
# Remove location suffix (e.g., "Providence, RI")
title = re.sub(r'\s*[-–—]?\s*[A-Z][a-z]+,?\s*[A-Z]{2}\s*$', '', title)
return title.strip()
def find_song_match(session, song_title: str, all_songs: list) -> Song:
"""Try multiple matching strategies to find a song."""
normalized_search = normalize_title(song_title)
# Strategy 1: Exact match (case insensitive)
for song in all_songs:
if song.title.lower() == song_title.lower():
return song
# Strategy 2: Normalized exact match
for song in all_songs:
if normalize_title(song.title) == normalized_search:
return song
# Strategy 3: Starts with (for songs with suffixes in DB)
for song in all_songs:
if normalize_title(song.title).startswith(normalized_search):
return song
if normalized_search.startswith(normalize_title(song.title)):
return song
# Strategy 4: Contains (substring match)
for song in all_songs:
norm_song = normalize_title(song.title)
if len(normalized_search) >= 4: # Avoid short false positives
if normalized_search in norm_song or norm_song in normalized_search:
return song
# Strategy 5: Word overlap (for complex titles)
search_words = set(normalized_search.split())
if len(search_words) >= 2: # Only for multi-word titles
for song in all_songs:
song_words = set(normalize_title(song.title).split())
# If most words match
overlap = len(search_words & song_words)
if overlap >= len(search_words) * 0.7:
return song
return None
def import_videos():
"""Import video links into the database."""
with open("youtube_videos.json", 'r') as f:
videos = json.load(f)
stats = {
'songs_matched': 0,
'songs_not_found': 0,
'songs_not_found_titles': [],
'sequences_processed': 0,
'full_shows_matched': 0,
'no_date': 0,
'skipped': 0,
'show_not_found': 0
}
with Session(engine) as session:
# Pre-load all songs for faster matching
all_songs = session.exec(select(Song)).all()
print(f"Loaded {len(all_songs)} songs from database")
for video in videos:
video_id = video.get('videoId')
raw_title = video.get('title', '')
video_type = video.get('type', 'song')
date_str = video.get('date')
youtube_url = make_youtube_url(video_id)
# Skip non-performance content
if video_type in ('documentary', 'visualizer', 'session'):
stats['skipped'] += 1
continue
# Skip videos without dates
if not date_str:
stats['no_date'] += 1
continue
# Parse date
try:
show_date = datetime.strptime(date_str, '%Y-%m-%d')
except ValueError:
stats['no_date'] += 1
continue
# Find show by date
show = session.exec(
select(Show).where(Show.date == show_date)
).first()
if not show:
stats['show_not_found'] += 1
continue
# Handle full shows
if video_type == 'full_show':
show.youtube_link = youtube_url
session.add(show)
stats['full_shows_matched'] += 1
continue
# Extract song title
song_title = extract_song_title(raw_title)
# Handle sequences
if video_type == 'sequence' or '' in song_title or '>' in song_title:
song_titles = [s.strip() for s in re.split(r'[→>]', song_title)]
matched_any = False
for title in song_titles:
if not title or len(title) < 2:
continue
song = find_song_match(session, title, all_songs)
if song:
perf = session.exec(
select(Performance).where(
Performance.show_id == show.id,
Performance.song_id == song.id
)
).first()
if perf:
perf.youtube_link = youtube_url
session.add(perf)
matched_any = True
if matched_any:
stats['sequences_processed'] += 1
else:
stats['songs_not_found'] += 1
stats['songs_not_found_titles'].append(f"SEQ: {song_title}")
continue
# Single song matching
song = find_song_match(session, song_title, all_songs)
if song:
perf = session.exec(
select(Performance).where(
Performance.show_id == show.id,
Performance.song_id == song.id
)
).first()
if perf:
perf.youtube_link = youtube_url
session.add(perf)
stats['songs_matched'] += 1
else:
# Song exists but wasn't played at this show
stats['songs_not_found'] += 1
stats['songs_not_found_titles'].append(f"{date_str}: {song_title} (song exists, no perf)")
else:
stats['songs_not_found'] += 1
stats['songs_not_found_titles'].append(f"{date_str}: {song_title}")
session.commit()
print("\n" + "="*50)
print("IMPORT SUMMARY")
print("="*50)
print(f" songs_matched: {stats['songs_matched']}")
print(f" sequences_processed: {stats['sequences_processed']}")
print(f" full_shows_matched: {stats['full_shows_matched']}")
print(f" songs_not_found: {stats['songs_not_found']}")
print(f" no_date: {stats['no_date']}")
print(f" skipped: {stats['skipped']}")
print(f" show_not_found: {stats['show_not_found']}")
total_linked = stats['songs_matched'] + stats['sequences_processed'] + stats['full_shows_matched']
print(f"\n TOTAL LINKED: {total_linked}")
# Show some unmatched titles for debugging
if stats['songs_not_found_titles']:
print("\n" + "="*50)
print("SAMPLE UNMATCHED (first 20):")
print("="*50)
for title in stats['songs_not_found_titles'][:20]:
print(f" - {title}")
if __name__ == "__main__":
import_videos()