fix: correct Bandcamp regex pattern
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
This commit is contained in:
parent
ca28293cea
commit
26171e1937
1 changed file with 6 additions and 5 deletions
|
|
@@ -33,18 +33,19 @@ def scrape_bandcamp_catalog() -> dict:
|
|||
|
||||
html = resp.text
|
||||
|
||||
# Extract album URLs and parse dates from them
|
||||
# Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-...
|
||||
url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'
|
||||
# Extract album URLs - pattern: album/YYYY-MM-DD-rest-of-slug
|
||||
# The URLs appear as relative paths like: album/2025-12-13-goosemas-show-upon-time...
|
||||
url_pattern = r'album/(\d{4})-(\d{2})-(\d{2})-[a-z0-9-]+'
|
||||
|
||||
for match in re.finditer(url_pattern, html):
|
||||
url = match.group(0)
|
||||
album_path = match.group(0)
|
||||
year, month, day = match.groups()
|
||||
date_str = f"{year}-{month}-{day}"
|
||||
full_url = f"https://goosetheband.bandcamp.com/{album_path}"
|
||||
|
||||
# Keep the first URL for each date (in case of duplicates)
|
||||
if date_str not in albums:
|
||||
albums[date_str] = url
|
||||
albums[date_str] = full_url
|
||||
|
||||
print(f"Scraped {len(albums)} Bandcamp album URLs")
|
||||
return albums
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue