morethanadiagnosis-hub/scraper.js
admin da63a31c95 feat: add website scraper and handoff documentation for claude-web
- Create Puppeteer-based scraper for morethanadiagnosis.org
- Extract full page structure, content, navigation, and images
- Generate JSON output with 13 headings, 24 paragraphs, 22 CTAs, 34 links, 15 images
- Add comprehensive handoff doc with implementation guide for frontend
- Document all website sections: Happy Mail, Support, Podcast, Resources, Shop
- Include content themes and recommendations for Next.js components
2025-11-18 17:17:43 +00:00

166 lines
4.7 KiB
JavaScript

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

/**
 * Scrapes morethanadiagnosis.org to extract page content and structure
 */
async function scrapeWebsite() {
  let browser;

  try {
    console.log('🚀 Starting Puppeteer scraper...');

    // Launch browser
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });

    const page = await browser.newPage();

    // Set user agent to avoid detection
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    console.log('📄 Loading morethanadiagnosis.org...');

    // Navigate to the site with longer timeout
    await page.goto('https://morethanadiagnosis.org', {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    console.log('⏳ Waiting for JavaScript to render...');

    // Wait for Wix to fully load
    await new Promise(resolve => setTimeout(resolve, 3000));

    console.log('📸 Extracting content...');

    // Extract all visible text
    const content = await page.evaluate(() => {
      const data = {
        title: document.title,
        url: window.location.href,
        headings: [],
        paragraphs: [],
        buttons: [],
        links: [],
        images: [],
        fullText: document.body.innerText,
        sections: []
      };

      // Extract headings
      document.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach(el => {
        const text = el.textContent.trim();
        if (text) {
          data.headings.push({
            level: el.tagName,
            text: text
          });
        }
      });

      // Extract paragraphs
      document.querySelectorAll('p').forEach(el => {
        const text = el.textContent.trim();
        if (text && text.length > 20) {
          data.paragraphs.push(text);
        }
      });

      // Extract buttons and CTAs
      document.querySelectorAll('button, a[role="button"], [class*="btn"], [class*="button"]').forEach(el => {
        const text = el.textContent.trim();
        if (text) {
          data.buttons.push({
            text: text,
            href: el.href || '',
            class: el.className
          });
        }
      });

      // Extract links
      document.querySelectorAll('a[href]').forEach(el => {
        const text = el.textContent.trim();
        const href = el.href;
        if (text && !href.includes('javascript:') && text.length < 100) {
          data.links.push({
            text: text,
            href: href
          });
        }
      });

      // Extract images
      document.querySelectorAll('img[src]').forEach(el => {
        data.images.push({
          src: el.src,
          alt: el.alt,
          title: el.title
        });
      });

      // Extract major sections
      document.querySelectorAll('section, [class*="section"], main, [role="main"]').forEach(el => {
        const heading = el.querySelector('h1, h2, h3');
        if (heading) {
          data.sections.push({
            heading: heading.textContent.trim(),
            content: el.textContent.trim().substring(0, 500)
          });
        }
      });

      return data;
    });
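
    // Shape of the saved JSON (mirrors the `data` object built above):
    // {
    //   "title": "...", "url": "...", "fullText": "...",
    //   "headings":   [{ "level": "H2", "text": "..." }, ...],
    //   "paragraphs": ["...", ...],
    //   "buttons":    [{ "text": "...", "href": "...", "class": "..." }, ...],
    //   "links":      [{ "text": "...", "href": "..." }, ...],
    //   "images":     [{ "src": "...", "alt": "...", "title": "..." }, ...],
    //   "sections":   [{ "heading": "...", "content": "..." }, ...]
    // }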
    console.log('💾 Saving extracted content...');

    // Save to file
    const outputPath = '/srv/containers/mtad-api/website_content.json';
    fs.writeFileSync(outputPath, JSON.stringify(content, null, 2));

    console.log(`\n✅ Scraping complete!\n`);
    console.log(`📊 Content Summary:`);
    console.log(` - Headings: ${content.headings.length}`);
    console.log(` - Paragraphs: ${content.paragraphs.length}`);
    console.log(` - Buttons/CTAs: ${content.buttons.length}`);
    console.log(` - Links: ${content.links.length}`);
    console.log(` - Images: ${content.images.length}`);
    console.log(` - Sections: ${content.sections.length}`);
    console.log(`\n💾 Full content saved to: ${outputPath}`);

    // Print preview
    console.log(`\n🔍 Preview (first 2000 chars):\n`);
    console.log(content.fullText.substring(0, 2000));
    console.log(`\n...\n`);

    // Print navigation if found
    if (content.links.length > 0) {
      console.log(`📍 Navigation Links Found:`);
      content.links
        .filter(l => !l.href.includes('#') && l.text.length < 50)
        .slice(0, 10)
        .forEach(l => {
          console.log(` - ${l.text}: ${l.href}`);
        });
    }
  } catch (error) {
    console.error('❌ Scraping failed:', error.message);
    // Set the exit code instead of calling process.exit() so the finally
    // block still runs and the browser is closed before the process ends
    process.exitCode = 1;
  } finally {
    if (browser) {
      await browser.close();
      console.log('\n🔒 Browser closed.');
    }
  }
}

// Run the scraper
scrapeWebsite();
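
// ---------------------------------------------------------------------------
// Hypothetical companion helper (an illustration added to this listing, not
// part of the original scraper, and never invoked here): a minimal sketch of
// how a Node/Next.js data layer could load the generated JSON for the frontend
// handoff described in the commit message. The helper name and default path
// parameter are assumptions; the output location and field names come from
// scrapeWebsite() above.
// ---------------------------------------------------------------------------
function loadScrapedContent(jsonPath = '/srv/containers/mtad-api/website_content.json') {
  // Parse the JSON written by scrapeWebsite()
  const content = JSON.parse(fs.readFileSync(jsonPath, 'utf8'));

  // Derive a simple navigation model from the scraped links, mirroring the
  // preview printed above (skip in-page anchors, keep short labels)
  const nav = content.links.filter(l => !l.href.includes('#') && l.text.length < 50);

  return { ...content, nav };
}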