fix: Rewrite youtube parser to handle escaped markdown line-by-line
This commit is contained in:
parent
98a7965c52
commit
823c6e7dee
1 changed file with 30 additions and 25 deletions
|
|
@@ -18,33 +18,38 @@ def parse_youtube_md(filepath: str) -> list:
|
||||||
with open(filepath, 'r') as f:
|
with open(filepath, 'r') as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
|
||||||
# Clean up escaped characters from markdown first
|
# Parse line-by-line looking for JSON objects
|
||||||
content = content.replace('\\`', '`').replace('\\-', '-').replace('\\_', '_')
|
# This handles the escaped markdown format in the source file
|
||||||
|
videos = []
|
||||||
|
in_json = False
|
||||||
|
|
||||||
# Find JSON block (between ```json and ```)
|
for line in content.split('\n'):
|
||||||
match = re.search(r'```json\s*\n?\s*(\[.*)', content, re.DOTALL)
|
line = line.strip()
|
||||||
if not match:
|
|
||||||
print("No JSON block found in file.")
|
|
||||||
return []
|
|
||||||
|
|
||||||
json_str = match.group(1)
|
# Detect start of JSON block (with or without escapes)
|
||||||
# Find the array - it may not be closed properly, so we find opening [ and match to end
|
if 'json' in line.lower() and ('`' in line or '\\' in line):
|
||||||
# Try to parse as much valid JSON as possible
|
in_json = True
|
||||||
try:
|
continue
|
||||||
# Try to find a complete JSON array
|
|
||||||
return json.loads(json_str.split('```')[0].strip())
|
# Skip array markers
|
||||||
except json.JSONDecodeError:
|
if line in ['[', '\\[', ']']:
|
||||||
# If that fails, try line-by-line parsing
|
continue
|
||||||
lines = json_str.split('\n')
|
|
||||||
videos = []
|
# Process JSON objects
|
||||||
for line in lines:
|
if in_json and line.startswith('{'):
|
||||||
line = line.strip().rstrip(',')
|
# Clean the line
|
||||||
if line.startswith('{') and line.endswith('}'):
|
clean_line = line.rstrip(',').rstrip()
|
||||||
|
# Remove trailing markdown escapes
|
||||||
|
clean_line = clean_line.replace('\\_', '_').replace('\\-', '-')
|
||||||
|
|
||||||
|
if clean_line.endswith('}'):
|
||||||
try:
|
try:
|
||||||
videos.append(json.loads(line))
|
obj = json.loads(clean_line)
|
||||||
except json.JSONDecodeError:
|
videos.append(obj)
|
||||||
continue
|
except json.JSONDecodeError as e:
|
||||||
return videos
|
print(f"Parse error on line: {clean_line[:50]}... - {e}")
|
||||||
|
|
||||||
|
return videos
|
||||||
|
|
||||||
def normalize_title(title: str) -> str:
|
def normalize_title(title: str) -> str:
|
||||||
"""Normalize song title for matching."""
|
"""Normalize song title for matching."""
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue