- Create Puppeteer-based scraper for morethanadiagnosis.org
- Extract full page structure, content, navigation, and images
- Generate JSON output with 13 headings, 24 paragraphs, 22 CTAs, 34 links, 15 images
- Add comprehensive handoff doc with implementation guide for frontend
- Document all website sections: Happy Mail, Support, Podcast, Resources, Shop
- Include content themes and recommendations for Next.js components
166 lines
4.7 KiB
JavaScript
166 lines
4.7 KiB
JavaScript
const puppeteer = require('puppeteer');
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
|
|
/**
 * Collects the content of the currently rendered document.
 *
 * This function is handed to `page.evaluate`, so it executes inside the
 * browser page rather than in Node. It must therefore stay self-contained:
 * it may only use browser globals (`document`, `window`) and helpers defined
 * in its own body.
 *
 * @returns {{
 *   title: string,
 *   url: string,
 *   headings: Array<{level: string, text: string}>,
 *   paragraphs: string[],
 *   buttons: Array<{text: string, href: string, class: string}>,
 *   links: Array<{text: string, href: string}>,
 *   images: Array<{src: string, alt: string, title: string}>,
 *   fullText: string,
 *   sections: Array<{heading: string, content: string}>
 * }} Serializable snapshot of the page content.
 */
function extractPageContent() {
  // SVG elements expose `href` (and `className`) as SVGAnimatedString objects
  // instead of strings. Calling string methods on them throws, and they do
  // not serialize as text, so normalise anything non-string to ''.
  const hrefOf = (el) => (typeof el.href === 'string' ? el.href : '');

  const data = {
    title: document.title,
    url: window.location.href,
    headings: [],
    paragraphs: [],
    buttons: [],
    links: [],
    images: [],
    fullText: document.body.innerText,
    sections: [],
  };

  // Headings: keep the tag name (H1..H6) so consumers can rebuild hierarchy.
  document.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach((el) => {
    const text = el.textContent.trim();
    if (text) {
      data.headings.push({ level: el.tagName, text });
    }
  });

  // Paragraphs: only those longer than 20 characters are kept.
  document.querySelectorAll('p').forEach((el) => {
    const text = el.textContent.trim();
    if (text.length > 20) {
      data.paragraphs.push(text);
    }
  });

  // Buttons and CTAs: real buttons plus anything styled/declared as one.
  document
    .querySelectorAll('button, a[role="button"], [class*="btn"], [class*="button"]')
    .forEach((el) => {
      const text = el.textContent.trim();
      if (text) {
        data.buttons.push({
          text,
          href: hrefOf(el),
          // Read the attribute rather than `className` so SVG matches yield
          // a plain string as well.
          class: el.getAttribute('class') || '',
        });
      }
    });

  // Links: skip javascript: pseudo-links, long link texts (>= 100 chars)
  // and anchors without a usable string href (e.g. SVG <a>).
  document.querySelectorAll('a[href]').forEach((el) => {
    const text = el.textContent.trim();
    const href = hrefOf(el);
    if (text && href && !href.includes('javascript:') && text.length < 100) {
      data.links.push({ text, href });
    }
  });

  // Images: every <img> that carries a src attribute.
  document.querySelectorAll('img[src]').forEach((el) => {
    data.images.push({ src: el.src, alt: el.alt, title: el.title });
  });

  // Major sections: containers that hold at least one h1-h3; the content is
  // truncated to the first 500 characters.
  document
    .querySelectorAll('section, [class*="section"], main, [role="main"]')
    .forEach((el) => {
      const heading = el.querySelector('h1, h2, h3');
      if (heading) {
        data.sections.push({
          heading: heading.textContent.trim(),
          content: el.textContent.trim().substring(0, 500),
        });
      }
    });

  return data;
}

/**
 * Prints a human-readable summary of the scraped content to stdout: item
 * counts, the output location, a text preview and up to ten navigation links.
 *
 * @param {object} content - Object produced by `extractPageContent`.
 * @param {string} outputPath - File the content was written to.
 * @returns {void}
 */
function printSummary(content, outputPath) {
  console.log(`\n✅ Scraping complete!\n`);
  console.log(`📊 Content Summary:`);
  console.log(` - Headings: ${content.headings.length}`);
  console.log(` - Paragraphs: ${content.paragraphs.length}`);
  console.log(` - Buttons/CTAs: ${content.buttons.length}`);
  console.log(` - Links: ${content.links.length}`);
  console.log(` - Images: ${content.images.length}`);
  console.log(` - Sections: ${content.sections.length}`);
  console.log(`\n💾 Full content saved to: ${outputPath}`);

  // Print preview
  console.log(`\n🔍 Preview (first 2000 chars):\n`);
  console.log(content.fullText.substring(0, 2000));
  console.log(`\n...\n`);

  // Print navigation if found: links without a fragment and with short text.
  if (content.links.length > 0) {
    console.log(`📍 Navigation Links Found:`);
    content.links
      .filter((l) => !l.href.includes('#') && l.text.length < 50)
      .slice(0, 10)
      .forEach((l) => {
        console.log(` - ${l.text}: ${l.href}`);
      });
  }
}

/**
 * Scrapes a web page (morethanadiagnosis.org by default) with Puppeteer and
 * writes the extracted content and structure to a JSON file.
 *
 * Errors are logged and reported through `process.exitCode = 1` rather than
 * `process.exit(1)`: `process.exit` terminates the process synchronously,
 * which would skip the `finally` block and leave the browser unclosed.
 *
 * @param {object} [options]
 * @param {string} [options.url='https://morethanadiagnosis.org'] - Page to load.
 * @param {string} [options.outputPath='/srv/containers/mtad-api/website_content.json']
 *   - Destination JSON file; its parent directory is created if missing.
 * @param {number} [options.renderDelayMs=3000] - Extra pause after navigation
 *   to let client-side rendering settle.
 * @returns {Promise<void>} Resolves once the browser is closed. Rejects only
 *   if closing the browser itself fails.
 */
async function scrapeWebsite({
  url = 'https://morethanadiagnosis.org',
  outputPath = '/srv/containers/mtad-api/website_content.json',
  renderDelayMs = 3000,
} = {}) {
  let browser;

  try {
    console.log('🚀 Starting Puppeteer scraper...');

    // Parse the URL up front so an invalid one fails before a browser starts.
    const { host } = new URL(url);

    // Launch browser
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const page = await browser.newPage();

    // Set user agent to avoid detection
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    console.log(`📄 Loading ${host}...`);

    // Navigate to the site with longer timeout
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });

    console.log('⏳ Waiting for JavaScript to render...');

    // Fixed pause after network idle so late client-side rendering can finish
    // (the original author notes the site is built on Wix).
    await new Promise((resolve) => setTimeout(resolve, renderDelayMs));

    console.log('📸 Extracting content...');

    const content = await page.evaluate(extractPageContent);

    console.log('💾 Saving extracted content...');

    // Save to file, creating the target directory first so a fresh machine
    // does not fail with ENOENT.
    fs.mkdirSync(path.dirname(outputPath), { recursive: true });
    fs.writeFileSync(outputPath, JSON.stringify(content, null, 2));

    printSummary(content, outputPath);
  } catch (error) {
    console.error('❌ Scraping failed:', error.message);
    // Signal failure without exiting immediately so `finally` can still
    // close the browser.
    process.exitCode = 1;
  } finally {
    if (browser) {
      await browser.close();
      console.log('\n🔒 Browser closed.');
    }
  }
}
|
|
|
|
// Run the scraper. The returned promise can still reject (for example when
// closing the browser fails inside scrapeWebsite's `finally` block), so
// handle that here instead of leaving the promise floating.
scrapeWebsite().catch((error) => {
  console.error('❌ Scraper terminated unexpectedly:', error);
  process.exitCode = 1;
});
|