fix: Handle escaped markdown in youtube import parser

This commit is contained in:
fullsizemalt 2025-12-21 22:33:51 -08:00
parent 8620841932
commit 98a7965c52

View file

@@ -18,21 +18,33 @@ def parse_youtube_md(filepath: str) -> list:
with open(filepath, 'r') as f: with open(filepath, 'r') as f:
content = f.read() content = f.read()
# Clean up escaped characters from markdown first
content = content.replace('\\`', '`').replace('\\-', '-').replace('\\_', '_')
# Find JSON block (between ```json and ```) # Find JSON block (between ```json and ```)
match = re.search(r'```json\s*\n?\s*(\[.*?\])', content, re.DOTALL) match = re.search(r'```json\s*\n?\s*(\[.*)', content, re.DOTALL)
if not match: if not match:
print("No JSON block found in file.") print("No JSON block found in file.")
return [] return []
json_str = match.group(1) json_str = match.group(1)
# Clean up escaped characters from markdown # Find the array - it may not be closed properly, so we find opening [ and match to end
json_str = json_str.replace('\\-', '-').replace('\\_', '_') # Try to parse as much valid JSON as possible
try: try:
return json.loads(json_str) # Try to find a complete JSON array
except json.JSONDecodeError as e: return json.loads(json_str.split('```')[0].strip())
print(f"JSON parse error: {e}") except json.JSONDecodeError:
return [] # If that fails, try line-by-line parsing
lines = json_str.split('\n')
videos = []
for line in lines:
line = line.strip().rstrip(',')
if line.startswith('{') and line.endswith('}'):
try:
videos.append(json.loads(line))
except json.JSONDecodeError:
continue
return videos
def normalize_title(title: str) -> str: def normalize_title(title: str) -> str:
"""Normalize song title for matching.""" """Normalize song title for matching."""