fix: Rewrite youtube parser to handle escaped markdown line-by-line

This commit is contained in:
fullsizemalt 2025-12-21 22:35:41 -08:00
parent 98a7965c52
commit 823c6e7dee

View file

@@ -18,33 +18,38 @@ def parse_youtube_md(filepath: str) -> list:
with open(filepath, 'r') as f: with open(filepath, 'r') as f:
content = f.read() content = f.read()
# Clean up escaped characters from markdown first # Parse line-by-line looking for JSON objects
content = content.replace('\\`', '`').replace('\\-', '-').replace('\\_', '_') # This handles the escaped markdown format in the source file
videos = []
in_json = False
# Find JSON block (between ```json and ```) for line in content.split('\n'):
match = re.search(r'```json\s*\n?\s*(\[.*)', content, re.DOTALL) line = line.strip()
if not match:
print("No JSON block found in file.") # Detect start of JSON block (with or without escapes)
return [] if 'json' in line.lower() and ('`' in line or '\\' in line):
in_json = True
json_str = match.group(1) continue
# Find the array - it may not be closed properly, so we find opening [ and match to end
# Try to parse as much valid JSON as possible # Skip array markers
try: if line in ['[', '\\[', ']']:
# Try to find a complete JSON array continue
return json.loads(json_str.split('```')[0].strip())
except json.JSONDecodeError: # Process JSON objects
# If that fails, try line-by-line parsing if in_json and line.startswith('{'):
lines = json_str.split('\n') # Clean the line
videos = [] clean_line = line.rstrip(',').rstrip()
for line in lines: # Remove trailing markdown escapes
line = line.strip().rstrip(',') clean_line = clean_line.replace('\\_', '_').replace('\\-', '-')
if line.startswith('{') and line.endswith('}'):
if clean_line.endswith('}'):
try: try:
videos.append(json.loads(line)) obj = json.loads(clean_line)
except json.JSONDecodeError: videos.append(obj)
continue except json.JSONDecodeError as e:
return videos print(f"Parse error on line: {clean_line[:50]}... - {e}")
return videos
def normalize_title(title: str) -> str: def normalize_title(title: str) -> str:
"""Normalize song title for matching.""" """Normalize song title for matching."""