feat: add Bandcamp catalog import script
parent d54b217264
commit ca28293cea
1 changed file with 113 additions and 0 deletions
backend/import_bandcamp_catalog.py (new file, +113)
@@ -0,0 +1,113 @@
"""
Bandcamp Catalog Import Script

Scrapes the Goose Bandcamp discography page to get all album URLs,
then matches them to shows by date and updates the database.

Run: python import_bandcamp_catalog.py [--save]
"""
import re

import requests
from sqlmodel import Session, select

from database import engine
from models import Show

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}


def scrape_bandcamp_catalog() -> dict:
    """
    Scrape the Goose Bandcamp music page for all album URLs.
    Returns a dict mapping date (YYYY-MM-DD) -> album URL.
    """
    music_url = "https://goosetheband.bandcamp.com/music"
    albums = {}

    try:
        resp = requests.get(music_url, headers=HEADERS, timeout=30)
        if resp.status_code != 200:
            print(f"Failed to fetch Bandcamp catalog: {resp.status_code}")
            return {}

        html = resp.text

        # Extract album URLs and parse dates from them.
        # Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-...
        url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'

        for match in re.finditer(url_pattern, html):
            url = match.group(0)
            year, month, day = match.groups()
            date_str = f"{year}-{month}-{day}"

            # Keep the first URL for each date (in case of duplicates).
            if date_str not in albums:
                albums[date_str] = url

        print(f"Scraped {len(albums)} Bandcamp album URLs")
        return albums

    except Exception as e:
        print(f"Error scraping Bandcamp: {e}")
        return {}


def main(dry_run: bool = True):
    """Match scraped album URLs to shows by date; write changes only with --save."""
    print("=" * 60)
    print("Bandcamp Catalog Import")
    print("=" * 60)

    # Scrape the full catalog once up front.
    bandcamp_albums = scrape_bandcamp_catalog()

    if not bandcamp_albums:
        print("No albums found, exiting.")
        return

    with Session(engine) as session:
        # Get all shows.
        shows = session.exec(select(Show)).all()
        print(f"Found {len(shows)} shows in database")

        updated = 0
        already_had = 0
        not_found = 0

        for show in shows:
            date_str = show.date.strftime("%Y-%m-%d")

            # Skip shows that already have a proper album link.
            if show.bandcamp_link and "/album/" in show.bandcamp_link:
                already_had += 1
                continue

            # Look up the show's date in the scraped catalog.
            if date_str in bandcamp_albums:
                new_url = bandcamp_albums[date_str]
                print(f"✓ {date_str} -> {new_url}")
                updated += 1

                if not dry_run:
                    show.bandcamp_link = new_url
                    session.add(show)
            else:
                not_found += 1

        if not dry_run:
            session.commit()

        print("\n" + "=" * 60)
        print("Summary")
        print("=" * 60)
        print(f"Shows with new Bandcamp link: {updated}")
        print(f"Shows already had album link: {already_had}")
        print(f"Shows not found in Bandcamp: {not_found}")

        if dry_run:
            print("\n[DRY RUN - Run with --save to commit changes]")


if __name__ == "__main__":
    import sys

    dry_run = "--save" not in sys.argv
    main(dry_run=dry_run)
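
For reference, a quick sanity check of the URL pattern above; the sample href is invented purely to illustrate the expected shape of a Bandcamp album URL:

import re

url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'

# Hypothetical href, not a real album URL; shows the YYYY-MM-DD slug format.
sample = 'href="https://goosetheband.bandcamp.com/album/2023-06-10-some-venue-some-city"'
m = re.search(url_pattern, sample)
assert m is not None
print(m.group(0))  # https://goosetheband.bandcamp.com/album/2023-06-10-some-venue-some-city
print(m.groups())  # ('2023', '06', '10')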
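
For context, the script assumes the Show model exposes at least a date and a nullable bandcamp_link column. A minimal sketch of what models.py might define (hypothetical; the actual model is not part of this commit):

# Hypothetical sketch of the Show model this script depends on;
# the real models.py is not included in this commit.
from datetime import date as date_type
from typing import Optional

from sqlmodel import Field, SQLModel


class Show(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    date: date_type                       # read via show.date.strftime("%Y-%m-%d")
    bandcamp_link: Optional[str] = None   # set by this import script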