"""
|
|
Fetch all videos from Goose YouTube channel using YouTube Data API v3
|
|
"""
|
|
import requests
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
|
|
API_KEY = "AIzaSyCxDpv6HM-sPD8vPJIBffwa2-skOpEJkOU"
|
|
CHANNEL_HANDLE = "@GooseTheBand"
|
|
|
|


def get_channel_id(handle: str) -> str | None:
    """Look up a channel ID from a handle via the search endpoint."""
    url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        "key": API_KEY,
        "q": handle,
        "type": "channel",
        "part": "snippet",
        "maxResults": 1,
    }
    resp = requests.get(url, params=params, timeout=30)
    data = resp.json()
    if data.get("items"):
        return data["items"][0]["snippet"]["channelId"]
    return None
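

# Possible alternative (a sketch, not used by main() below): channels.list
# accepts a `forHandle` parameter that resolves a handle directly, avoiding
# the search endpoint and its much higher quota cost.
def get_channel_id_by_handle(handle: str) -> str | None:
    """Resolve a handle straight to a channel ID via channels.list."""
    resp = requests.get(
        "https://www.googleapis.com/youtube/v3/channels",
        params={"key": API_KEY, "forHandle": handle, "part": "id"},
        timeout=30,
    )
    items = resp.json().get("items", [])
    return items[0]["id"] if items else None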


def get_uploads_playlist_id(channel_id: str) -> str | None:
    """Get the uploads playlist ID for a channel."""
    url = "https://www.googleapis.com/youtube/v3/channels"
    params = {
        "key": API_KEY,
        "id": channel_id,
        "part": "contentDetails",
    }
    resp = requests.get(url, params=params, timeout=30)
    data = resp.json()
    if data.get("items"):
        return data["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
    return None
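

# Shortcut (widely observed convention, not a documented guarantee): the
# uploads playlist ID is normally the channel ID with its "UC" prefix swapped
# for "UU", so the network call above can often be skipped.
def uploads_playlist_from_channel_id(channel_id: str) -> str:
    """Derive the uploads playlist ID locally, assuming the UC -> UU convention."""
    return "UU" + channel_id[len("UC"):]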


def get_all_videos(playlist_id: str) -> list:
    """Fetch all videos from a playlist, following nextPageToken pagination."""
    videos = []
    url = "https://www.googleapis.com/youtube/v3/playlistItems"
    next_page_token = None

    while True:
        params = {
            "key": API_KEY,
            "playlistId": playlist_id,
            "part": "snippet,contentDetails",
            "maxResults": 50,  # the maximum page size the API allows
        }
        if next_page_token:
            params["pageToken"] = next_page_token

        resp = requests.get(url, params=params, timeout=30)
        data = resp.json()

        if "error" in data:
            print(f"API Error: {data['error']}")
            break

        for item in data.get("items", []):
            snippet = item["snippet"]
            video = {
                "videoId": snippet["resourceId"]["videoId"],
                "title": snippet["title"],
                "description": snippet.get("description", ""),
                "publishedAt": snippet["publishedAt"],
                "thumbnails": snippet.get("thumbnails", {}),
            }
            videos.append(video)

        next_page_token = data.get("nextPageToken")
        print(f"Fetched {len(videos)} videos so far...")

        if not next_page_token:
            break

    return videos
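

# For reference, an abridged playlistItems.list response looks roughly like
# this (only the fields read above are shown; values are illustrative):
#
#   {
#     "nextPageToken": "CDIQAA",
#     "items": [
#       {
#         "snippet": {
#           "title": "...",
#           "description": "...",
#           "publishedAt": "2023-06-10T00:00:00Z",
#           "thumbnails": {"default": {"url": "..."}},
#           "resourceId": {"videoId": "abc123XYZ_w"}
#         }
#       }
#     ]
#   }
#
# The loop stops when "nextPageToken" is absent from the response.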


def parse_video_metadata(videos: list) -> list:
    """Parse video titles/descriptions to extract show date, venue, and type."""
    parsed = []

    # Date patterns to look for in titles/descriptions
    date_patterns = [
        r'(\d{1,2})[./](\d{1,2})[./](\d{2,4})',  # M/D/YY or M.D.YYYY
        r'(\d{4})-(\d{2})-(\d{2})',              # YYYY-MM-DD
    ]

    for video in videos:
        title = video["title"]
        desc = video.get("description", "")

        # Determine video type from keywords in the title
        video_type = "song"  # default
        title_lower = title.lower()

        if any(kw in title_lower for kw in ("full show", "live at", "night 1", "night 2", "night 3")):
            video_type = "full_show"
        elif "→" in title or "->" in title:
            video_type = "sequence"
        elif "documentary" in title_lower or "behind" in title_lower:
            video_type = "documentary"
        elif "visualizer" in title_lower:
            video_type = "visualizer"
        elif "session" in title_lower or "studio" in title_lower:
            video_type = "session"

        # Try to extract a show date; search the title and description
        # together, since the description often carries the date.
        show_date = None
        combined_text = f"{title} {desc}"
        for pattern in date_patterns:
            match = re.search(pattern, combined_text)
            if match:
                groups = match.groups()
                try:
                    if len(groups[0]) == 4:  # YYYY-MM-DD
                        show_date = f"{groups[0]}-{groups[1]}-{groups[2]}"
                    else:  # M/D/YY or M/D/YYYY
                        year = groups[2]
                        if len(year) == 2:
                            year = "20" + year if int(year) < 50 else "19" + year
                        month = groups[0].zfill(2)
                        day = groups[1].zfill(2)
                        show_date = f"{year}-{month}-{day}"
                    break
                except (ValueError, IndexError):
                    pass

        # Extract venue from the title if possible
        venue = None
        venue_patterns = [
            r'@ (.+)$',
            r'at (.+?) -',
            r'Live at (.+)',
            r'- (.+?, [A-Z]{2})$',
        ]
        for pattern in venue_patterns:
            match = re.search(pattern, title, re.IGNORECASE)
            if match:
                venue = match.group(1).strip()
                break

        parsed.append({
            "videoId": video["videoId"],
            "title": title,
            "date": show_date,
            "venue": venue,
            "type": video_type,
            "publishedAt": video["publishedAt"],
        })

    return parsed
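

# Illustrative check (hypothetical input, not real channel data): a record like
#   {"videoId": "x", "publishedAt": "2023-06-11T00:00:00Z",
#    "title": "Hungersite - 6/10/23 - Bonnaroo"}
# parses to type "song", date "2023-06-10", and venue None (no venue pattern
# matches a title of this shape; the venue heuristics are best-effort).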


def main():
    print("Fetching Goose YouTube channel videos...")

    # Get channel ID
    print(f"Looking up channel: {CHANNEL_HANDLE}")
    channel_id = get_channel_id(CHANNEL_HANDLE)
    if not channel_id:
        print("Could not find channel!")
        return
    print(f"Channel ID: {channel_id}")

    # Get uploads playlist
    uploads_playlist = get_uploads_playlist_id(channel_id)
    if not uploads_playlist:
        print("Could not find uploads playlist!")
        return
    print(f"Uploads playlist: {uploads_playlist}")

    # Fetch all videos
    videos = get_all_videos(uploads_playlist)
    print(f"\nTotal videos found: {len(videos)}")

    # Parse metadata
    parsed = parse_video_metadata(videos)

    # Save to JSON; keep non-ASCII characters like "→" readable in the output
    output_file = "youtube_videos.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {output_file}")

    # Show stats
    types = {}
    dated = 0
    for v in parsed:
        types[v["type"]] = types.get(v["type"], 0) + 1
        if v["date"]:
            dated += 1

    print("\n=== Stats ===")
    print(f"Total: {len(parsed)}")
    print(f"With dates: {dated}")
    for vtype, count in sorted(types.items()):
        print(f"  {vtype}: {count}")


if __name__ == "__main__":
    main()
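

# Downstream consumers can reload the dump like so (path matches main() above):
#   with open("youtube_videos.json", encoding="utf-8") as f:
#       full_shows = [v for v in json.load(f) if v["type"] == "full_show"]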