diff --git a/backend/import_youtube.py b/backend/import_youtube.py index 645c92a..b1129c5 100644 --- a/backend/import_youtube.py +++ b/backend/import_youtube.py @@ -18,33 +18,38 @@ def parse_youtube_md(filepath: str) -> list: with open(filepath, 'r') as f: content = f.read() - # Clean up escaped characters from markdown first - content = content.replace('\\`', '`').replace('\\-', '-').replace('\\_', '_') + # Parse line-by-line looking for JSON objects + # This handles the escaped markdown format in the source file + videos = [] + in_json = False - # Find JSON block (between ```json and ```) - match = re.search(r'```json\s*\n?\s*(\[.*)', content, re.DOTALL) - if not match: - print("No JSON block found in file.") - return [] - - json_str = match.group(1) - # Find the array - it may not be closed properly, so we find opening [ and match to end - # Try to parse as much valid JSON as possible - try: - # Try to find a complete JSON array - return json.loads(json_str.split('```')[0].strip()) - except json.JSONDecodeError: - # If that fails, try line-by-line parsing - lines = json_str.split('\n') - videos = [] - for line in lines: - line = line.strip().rstrip(',') - if line.startswith('{') and line.endswith('}'): + for line in content.split('\n'): + line = line.strip() + + # Detect start of JSON block (with or without escapes) + if 'json' in line.lower() and ('`' in line or '\\' in line): + in_json = True + continue + + # Skip array markers + if line in ['[', '\\[', ']']: + continue + + # Process JSON objects + if in_json and line.startswith('{'): + # Clean the line + clean_line = line.rstrip(',').rstrip() + # Remove trailing markdown escapes + clean_line = clean_line.replace('\\_', '_').replace('\\-', '-') + + if clean_line.endswith('}'): try: - videos.append(json.loads(line)) - except json.JSONDecodeError: - continue - return videos + obj = json.loads(clean_line) + videos.append(obj) + except json.JSONDecodeError as e: + print(f"Parse error on line: {clean_line[:50]}... - {e}") + + return videos def normalize_title(title: str) -> str: """Normalize song title for matching."""