feat: add Bandcamp catalog import script
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run

This commit is contained in:
fullsizemalt 2025-12-24 14:21:11 -08:00
parent d54b217264
commit ca28293cea

View file

@ -0,0 +1,113 @@
"""
Bandcamp Catalog Import Script
Scrapes the Goose Bandcamp discography page to get all album URLs,
then matches them to shows by date and updates the database.
Run: python import_bandcamp_catalog.py [--save]
"""
import re
import requests
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Show
# Request headers passed to requests.get() when fetching the catalog page.
# NOTE(review): the browser-style User-Agent is presumably there so the
# request is not rejected as an automated client -- confirm.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}
def scrape_bandcamp_catalog() -> dict:
    """
    Fetch the Goose Bandcamp music page and index its dated album links.

    Only absolute album URLs whose slug starts with a YYYY-MM-DD date are
    recognised. When several URLs share a date, the one that appears first
    in the page is kept.

    Returns:
        dict mapping date string (YYYY-MM-DD) -> album URL. An empty dict is
        returned when the page does not answer with HTTP 200, or when any
        exception is raised while fetching or parsing (the problem is
        printed rather than propagated).
    """
    catalog_url = "https://goosetheband.bandcamp.com/music"
    by_date = {}
    # Album links look like:
    #   https://goosetheband.bandcamp.com/album/YYYY-MM-DD-...
    album_re = re.compile(
        r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'
    )
    try:
        response = requests.get(catalog_url, headers=HEADERS, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch Bandcamp catalog: {response.status_code}")
            return {}
        for found in album_re.finditer(response.text):
            # The three captured groups are year, month and day.
            date_key = "-".join(found.groups())
            # setdefault leaves an existing entry alone, so the earliest
            # URL seen for a date wins over later duplicates.
            by_date.setdefault(date_key, found.group(0))
        print(f"Scraped {len(by_date)} Bandcamp album URLs")
        return by_date
    except Exception as err:
        # Best-effort scrape: report the failure and hand back an empty
        # catalog so the caller can exit cleanly.
        print(f"Error scraping Bandcamp: {err}")
        return {}
def main(dry_run: bool = True):
    """
    Match scraped Bandcamp album URLs to shows by date and report the result.

    Args:
        dry_run: When True (the default) matches are only printed. When
            False, each matched show's ``bandcamp_link`` is set and the
            session is committed once after all shows are processed.

    Shows whose existing ``bandcamp_link`` already contains "/album/" are
    counted as already linked and left untouched.
    """
    rule = "=" * 60
    print(rule)
    print("Bandcamp Catalog Import")
    print(rule)

    # Scrape catalog; an empty result means there is nothing to match.
    catalog = scrape_bandcamp_catalog()
    if not catalog:
        print("No albums found, exiting.")
        return

    linked_now = 0
    linked_before = 0
    unmatched = 0

    with Session(engine) as session:
        all_shows = session.exec(select(Show)).all()
        print(f"Found {len(all_shows)} shows in database")

        for show in all_shows:
            existing = show.bandcamp_link
            # A link that already points at an album page is kept as is.
            if existing and "/album/" in existing:
                linked_before += 1
                continue

            # Catalog keys are YYYY-MM-DD strings, so format the show date
            # the same way for the lookup.
            date_key = show.date.strftime("%Y-%m-%d")
            album_url = catalog.get(date_key)
            if album_url is None:
                unmatched += 1
                continue

            print(f"{date_key} -> {album_url}")
            linked_now += 1
            if not dry_run:
                show.bandcamp_link = album_url
                session.add(show)

        # Single commit for the whole batch, skipped entirely on a dry run.
        if not dry_run:
            session.commit()

    print("\n" + rule)
    print("Summary")
    print(rule)
    print(f"Shows with new Bandcamp link: {linked_now}")
    print(f"Shows already had album link: {linked_before}")
    print(f"Shows not found in Bandcamp: {unmatched}")
    if dry_run:
        print("\n[DRY RUN - Run with --save to commit changes]")
if __name__ == "__main__":
    import sys

    # Database writes happen only when --save is passed explicitly;
    # without it the script performs a dry run.
    save_requested = "--save" in sys.argv
    main(dry_run=not save_requested)