fix: correct Bandcamp regex pattern
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
Some checks are pending
Deploy Elmeg / deploy (push) Waiting to run
This commit is contained in:
parent
ca28293cea
commit
26171e1937
1 changed file with 6 additions and 5 deletions
|
|
@@ -33,18 +33,19 @@ def scrape_bandcamp_catalog() -> dict:
|
||||||
|
|
||||||
html = resp.text
|
html = resp.text
|
||||||
|
|
||||||
# Extract album URLs and parse dates from them
|
# Extract album URLs - pattern: album/YYYY-MM-DD-rest-of-slug
|
||||||
# Pattern: https://goosetheband.bandcamp.com/album/YYYY-MM-DD-...
|
# The URLs appear as relative paths like: album/2025-12-13-goosemas-show-upon-time...
|
||||||
url_pattern = r'https://goosetheband\.bandcamp\.com/album/(\d{4})-(\d{2})-(\d{2})-[^"\'>\s]+'
|
url_pattern = r'album/(\d{4})-(\d{2})-(\d{2})-[a-z0-9-]+'
|
||||||
|
|
||||||
for match in re.finditer(url_pattern, html):
|
for match in re.finditer(url_pattern, html):
|
||||||
url = match.group(0)
|
album_path = match.group(0)
|
||||||
year, month, day = match.groups()
|
year, month, day = match.groups()
|
||||||
date_str = f"{year}-{month}-{day}"
|
date_str = f"{year}-{month}-{day}"
|
||||||
|
full_url = f"https://goosetheband.bandcamp.com/{album_path}"
|
||||||
|
|
||||||
# Keep the first URL for each date (in case of duplicates)
|
# Keep the first URL for each date (in case of duplicates)
|
||||||
if date_str not in albums:
|
if date_str not in albums:
|
||||||
albums[date_str] = url
|
albums[date_str] = full_url
|
||||||
|
|
||||||
print(f"Scraped {len(albums)} Bandcamp album URLs")
|
print(f"Scraped {len(albums)} Bandcamp album URLs")
|
||||||
return albums
|
return albums
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue