elmeg-demo/backend/import_bandcamp_catalog.py
fullsizemalt 26171e1937
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
fix: correct Bandcamp regex pattern
2025-12-24 14:23:07 -08:00

114 lines
3.5 KiB
Python

"""
Bandcamp Catalog Import Script
Scrapes the Goose Bandcamp discography page to get all album URLs,
then matches them to shows by date and updates the database.
Run: python import_bandcamp_catalog.py [--save]  (dry run by default; --save commits changes)
"""
import re
import requests
from datetime import datetime
from sqlmodel import Session, select
from database import engine
from models import Show
# Browser-like User-Agent passed as the request headers when fetching Bandcamp.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}
def _extract_album_urls(html: str, base_url: str) -> dict:
    """Map dates found in album slugs to absolute album URLs.

    Args:
        html: Page markup to search for ``album/YYYY-MM-DD-slug`` paths.
        base_url: Site root (no trailing slash) the relative paths hang off.

    Returns:
        dict mapping date (YYYY-MM-DD) -> absolute album URL.
    """
    # The URLs appear as relative paths like:
    #   album/2025-12-13-goosemas-show-upon-time...
    slug_re = re.compile(r'album/(\d{4})-(\d{2})-(\d{2})-[a-z0-9-]+')
    found = {}
    for match in slug_re.finditer(html):
        date_key = "-".join(match.groups())
        # setdefault keeps the first URL seen for each date, so a repeated
        # link (or a later album with the same date) never overwrites it.
        found.setdefault(date_key, f"{base_url}/{match.group(0)}")
    return found


def scrape_bandcamp_catalog() -> dict:
    """
    Scrape the Goose Bandcamp music page for all album URLs.

    Returns:
        dict mapping date (YYYY-MM-DD) -> URL. Empty when the request fails
        or the page answers with a non-200 status; the reason is printed.
    """
    base_url = "https://goosetheband.bandcamp.com"
    # Only the network call is guarded, and only for request-level failures
    # (connection errors, timeouts, ...), so a bug in the parsing code is not
    # misreported as a scraping error.
    try:
        resp = requests.get(f"{base_url}/music", headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"Error scraping Bandcamp: {e}")
        return {}

    if resp.status_code != 200:
        print(f"Failed to fetch Bandcamp catalog: {resp.status_code}")
        return {}

    albums = _extract_album_urls(resp.text, base_url)
    print(f"Scraped {len(albums)} Bandcamp album URLs")
    return albums
def main(dry_run: bool = True):
    """Attach scraped Bandcamp album URLs to shows, matching on show date.

    Prints a banner, each date -> URL match, and a summary of how many shows
    were matched, already linked, or missing from the scraped catalog.

    Args:
        dry_run: When True (the default) matches are only reported. When
            False each matched show's ``bandcamp_link`` is set and the
            session is committed.
    """
    rule = "=" * 60
    print(rule)
    print("Bandcamp Catalog Import")
    print(rule)

    catalog = scrape_bandcamp_catalog()
    if not catalog:
        print("No albums found, exiting.")
        return

    persist = not dry_run
    tally = {"updated": 0, "already_had": 0, "not_found": 0}

    with Session(engine) as session:
        all_shows = session.exec(select(Show)).all()
        print(f"Found {len(all_shows)} shows in database")

        for show in all_shows:
            # The date is formatted before the link check, exactly as the
            # lookup key format produced by the scraper (YYYY-MM-DD).
            show_date = show.date.strftime("%Y-%m-%d")

            # An existing link that already points at an album is left alone.
            existing = show.bandcamp_link
            if existing and "/album/" in existing:
                tally["already_had"] += 1
                continue

            if show_date not in catalog:
                tally["not_found"] += 1
                continue

            album_url = catalog[show_date]
            print(f"{show_date} -> {album_url}")
            tally["updated"] += 1
            if persist:
                show.bandcamp_link = album_url
                session.add(show)

        if persist:
            session.commit()

        print("\n" + rule)
        print("Summary")
        print(rule)
        print(f"Shows with new Bandcamp link: {tally['updated']}")
        print(f"Shows already had album link: {tally['already_had']}")
        print(f"Shows not found in Bandcamp: {tally['not_found']}")
        if dry_run:
            print("\n[DRY RUN - Run with --save to commit changes]")
if __name__ == "__main__":
    import sys

    # Writing is opt-in: without an explicit --save flag this is a dry run.
    main(dry_run="--save" not in sys.argv)