const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

/**
 * Collects structured content from the document currently loaded in the page.
 *
 * This function is handed to `page.evaluate`, so it executes in the browser
 * context. It must stay self-contained: it may not reference any variable or
 * helper defined in this Node.js module.
 *
 * @returns {{
 *   title: string,
 *   url: string,
 *   headings: Array<{level: string, text: string}>,
 *   paragraphs: string[],
 *   buttons: Array<{text: string, href: string, class: string}>,
 *   links: Array<{text: string, href: string}>,
 *   images: Array<{src: string, alt: string, title: string}>,
 *   fullText: string,
 *   sections: Array<{heading: string, content: string}>
 * }} Serializable snapshot of the page content.
 */
function extractPageContent() {
  // SVG elements expose `href` and `className` as SVGAnimatedString objects
  // rather than strings. Normalizing here keeps string methods such as
  // `.includes` from throwing and keeps the serialized output consistent.
  const toText = (value) => {
    if (typeof value === 'string') {
      return value;
    }
    return value && typeof value.baseVal === 'string' ? value.baseVal : '';
  };

  const data = {
    title: document.title,
    url: window.location.href,
    headings: [],
    paragraphs: [],
    buttons: [],
    links: [],
    images: [],
    fullText: document.body.innerText,
    sections: []
  };

  // Headings: keep the tag name (H1..H6) so the hierarchy can be rebuilt.
  document.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach((el) => {
    const text = el.textContent.trim();
    if (text) {
      data.headings.push({ level: el.tagName, text: text });
    }
  });

  // Paragraphs: only those longer than 20 characters are kept.
  document.querySelectorAll('p').forEach((el) => {
    const text = el.textContent.trim();
    if (text && text.length > 20) {
      data.paragraphs.push(text);
    }
  });

  // Buttons and CTAs: real buttons plus anything whose class hints at one.
  document
    .querySelectorAll('button, a[role="button"], [class*="btn"], [class*="button"]')
    .forEach((el) => {
      const text = el.textContent.trim();
      if (text) {
        data.buttons.push({
          text: text,
          href: toText(el.href),
          class: toText(el.className)
        });
      }
    });

  // Links: skip javascript: pseudo-links and anchors with very long text.
  document.querySelectorAll('a[href]').forEach((el) => {
    const text = el.textContent.trim();
    const href = toText(el.href);
    if (text && !href.includes('javascript:') && text.length < 100) {
      data.links.push({ text: text, href: href });
    }
  });

  // Images: record source plus the descriptive attributes.
  document.querySelectorAll('img[src]').forEach((el) => {
    data.images.push({ src: el.src, alt: el.alt, title: el.title });
  });

  // Major sections: containers that hold at least one h1-h3 heading.
  // Content is truncated to 500 characters to keep the output compact.
  document
    .querySelectorAll('section, [class*="section"], main, [role="main"]')
    .forEach((el) => {
      const heading = el.querySelector('h1, h2, h3');
      if (heading) {
        data.sections.push({
          heading: heading.textContent.trim(),
          content: el.textContent.trim().substring(0, 500)
        });
      }
    });

  return data;
}

/**
 * Prints item counts, a text preview and the first navigation links.
 *
 * @param {object} content - Result produced by `extractPageContent`.
 * @param {string} outputPath - Location the JSON file was written to.
 * @returns {void}
 */
function printSummary(content, outputPath) {
  console.log(`\nāœ… Scraping complete!\n`);
  console.log(`šŸ“Š Content Summary:`);
  console.log(` - Headings: ${content.headings.length}`);
  console.log(` - Paragraphs: ${content.paragraphs.length}`);
  console.log(` - Buttons/CTAs: ${content.buttons.length}`);
  console.log(` - Links: ${content.links.length}`);
  console.log(` - Images: ${content.images.length}`);
  console.log(` - Sections: ${content.sections.length}`);
  console.log(`\nšŸ’¾ Full content saved to: ${outputPath}`);

  // Print preview
  console.log(`\nšŸ” Preview (first 2000 chars):\n`);
  console.log(content.fullText.substring(0, 2000));
  console.log(`\n...\n`);

  // Print navigation if found: at most ten short, non-fragment links.
  if (content.links.length > 0) {
    console.log(`šŸ“ Navigation Links Found:`);
    content.links
      .filter((l) => !l.href.includes('#') && l.text.length < 50)
      .slice(0, 10)
      .forEach((l) => {
        console.log(` - ${l.text}: ${l.href}`);
      });
  }
}

/**
 * Scrapes morethanadiagnosis.org to extract page content and structure.
 *
 * Launches headless Chromium through Puppeteer, loads the home page, waits
 * for client-side rendering, extracts the content, writes it as JSON to
 * `/srv/containers/mtad-api/website_content.json` and prints a summary.
 *
 * Errors are logged and reported through a non-zero process exit code; the
 * returned promise does not reject for scraping failures.
 *
 * @returns {Promise<void>}
 */
async function scrapeWebsite() {
  let browser;
  try {
    console.log('šŸš€ Starting Puppeteer scraper...');

    // Launch browser
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    const page = await browser.newPage();

    // Set user agent to avoid detection
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    console.log('šŸ“„ Loading morethanadiagnosis.org...');

    // Navigate to the site with longer timeout
    await page.goto('https://morethanadiagnosis.org', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    console.log('ā³ Waiting for JavaScript to render...');

    // Fixed grace period after network idle so late client-side rendering
    // (the original author notes the site is built on Wix) can settle.
    await new Promise((resolve) => setTimeout(resolve, 3000));

    console.log('šŸ“ø Extracting content...');

    // Extract all visible text and structural elements inside the page.
    const content = await page.evaluate(extractPageContent);

    console.log('šŸ’¾ Saving extracted content...');

    // Save to file
    const outputPath = '/srv/containers/mtad-api/website_content.json';
    fs.writeFileSync(outputPath, JSON.stringify(content, null, 2));

    printSummary(content, outputPath);
  } catch (error) {
    console.error('āŒ Scraping failed:', error.message);
    // Use exitCode rather than process.exit(1): process.exit terminates
    // immediately and would skip the `finally` block that closes the browser.
    process.exitCode = 1;
  } finally {
    if (browser) {
      await browser.close();
      console.log('\nšŸ”’ Browser closed.');
    }
  }
}

// Run the scraper. The catch covers failures raised during cleanup (for
// example browser.close() rejecting), which the function cannot handle itself.
scrapeWebsite().catch((error) => {
  console.error('Unexpected error while shutting down the scraper:', error);
  process.exitCode = 1;
});