From 1c545c93b4f15659de8fdd0093f9bfe5b54e9078 Mon Sep 17 00:00:00 2001 From: Dennis Date: Fri, 6 Mar 2026 07:44:32 +0100 Subject: [PATCH] feat: production hardening + smart subpage scanning with layout dedup Security: - Add CRON_SECRET auth to /api/cron/* endpoints - Add admin role verification to /api/admin/* routes - Add org membership check to /api/billing/usage - Add security headers (HSTS, X-Frame-Options, X-Content-Type-Options, Referrer-Policy, Permissions-Policy; no CSP yet) - Add env variable validation module (validates on import, server-side only) - Add rate limiting to backend API (30 req/min per IP) Infrastructure: - Add multi-stage Dockerfiles with non-root user + healthchecks - Update cron workflow to pass CRON_SECRET as an Authorization: Bearer header - Update .env.example with CRON_SECRET and all optional vars Smart subpage scanning: - Crawler now computes template_hash (DOM structure without content) - Scanner scans ALL unique-layout pages, not just main page - Pages with same layout (e.g. product pages) scanned only once - Deduplication by template_hash, fallback to content_hash - Main page always scanned with high priority - Re-checks subscription limits before each page scan Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .env.example | 11 ++ website-monitoring-backend/Dockerfile | 43 +++-- website-monitoring-backend/src/index.ts | 26 +++ .../.github/workflows/cron-scan.yml | 5 +- website-monitoring-frontend/Dockerfile | 42 +++-- .../database-schema.sql | 4 + website-monitoring-frontend/next.config.ts | 17 ++ .../setup-database.sql | 1 + .../src/app/api/admin/organizations/route.ts | 8 + .../src/app/api/admin/stats/route.ts | 7 +- .../src/app/api/admin/users/route.ts | 11 ++ .../src/app/api/billing/usage/route.ts | 9 +- .../src/app/api/cron/scan/route.ts | 4 + .../src/app/api/cron/uptime/route.ts | 5 + .../src/lib/apiAuth.ts | 148 ++++++++++++++++++ .../src/lib/validateEnv.ts | 52 ++++++ .../src/services/newCrawlerService.ts | 48 ++++++ .../src/services/scanScheduler.ts | 116 ++++++++++---- 18 files changed, 498 insertions(+), 59 deletions(-) create mode 100644 
website-monitoring-frontend/src/lib/apiAuth.ts create mode 100644 website-monitoring-frontend/src/lib/validateEnv.ts diff --git a/.env.example b/.env.example index 68b31a4..88b4aa2 100644 --- a/.env.example +++ b/.env.example @@ -18,3 +18,14 @@ CORS_ORIGIN=http://localhost:3000 NEXT_PUBLIC_SUPABASE_URL=https://your-project.supabase.co NEXT_PUBLIC_SUPABASE_ANON_KEY=your-anon-key SUPABASE_SERVICE_ROLE_KEY=your-service-role-key + +# ── Security ──────────────────────────────── +# Required in production: protects /api/cron/* endpoints +CRON_SECRET=generate-a-random-secret-here + +# ── Optional Services ─────────────────────── +# Email notifications (Resend — free tier: 3000 emails/mo) +RESEND_API_KEY=re_your_resend_key + +# Lighthouse backend URL (for automated scans) +LIGHTHOUSE_SERVICE_URL=http://localhost:5000 diff --git a/website-monitoring-backend/Dockerfile b/website-monitoring-backend/Dockerfile index 0c02b3a..3046d3c 100644 --- a/website-monitoring-backend/Dockerfile +++ b/website-monitoring-backend/Dockerfile @@ -1,25 +1,36 @@ -# Use the official Node.js image. -FROM node:18 +# --- Stage 1: Build --- +FROM node:20-slim AS builder -# OPTIONAL: Falls in der Base kein Chrome enthalten ist, -# müsstest du hier noch "apt-get update" + "apt-get install chromium" oder ähnliches ausführen, -# z. B.: -RUN apt-get update && apt-get install -y chromium - -# Create and change to the app directory. WORKDIR /app -# Copy application dependency manifests to the container image. COPY package*.json ./ +RUN npm ci -# Install production dependencies. -RUN npm install - -# Copy local code to the container image. COPY . . - -# Build the TypeScript code RUN npm run build -# Run the web service on container startup. 
+# --- Stage 2: Production --- +FROM node:20-slim AS runtime + +RUN apt-get update && apt-get install -y --no-install-recommends chromium \ + && rm -rf /var/lib/apt/lists/* + +ENV CHROME_BIN=/usr/bin/chromium +ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true + +WORKDIR /app + +RUN groupadd -r app && useradd -r -g app -d /app app + +COPY --from=builder --chown=app:app /app/dist ./dist +COPY --from=builder --chown=app:app /app/node_modules ./node_modules +COPY --from=builder --chown=app:app /app/package.json ./ + +USER app + +EXPOSE 5000 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD node -e "const h=require('http');h.get('http://localhost:5000/health',(r)=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))" + CMD ["node", "dist/index.js"] diff --git a/website-monitoring-backend/src/index.ts b/website-monitoring-backend/src/index.ts index e27ffef..7d79016 100644 --- a/website-monitoring-backend/src/index.ts +++ b/website-monitoring-backend/src/index.ts @@ -5,10 +5,36 @@ import lighthouseRouter from "./routes/lighthouse.js"; dotenv.config(); +// Rate limiting (simple in-memory for single instance) +const rateLimitMap = new Map(); +const RATE_LIMIT_WINDOW = 60_000; // 1 minute +const RATE_LIMIT_MAX = 30; // requests per window + +function rateLimit(req: Request, res: Response, next: () => void) { + const ip = req.ip || req.headers["x-forwarded-for"] || "unknown"; + const key = String(ip); + const now = Date.now(); + const entry = rateLimitMap.get(key); + + if (!entry || now > entry.resetAt) { + rateLimitMap.set(key, { count: 1, resetAt: now + RATE_LIMIT_WINDOW }); + return next(); + } + + if (entry.count >= RATE_LIMIT_MAX) { + res.status(429).json({ error: "Too many requests" }); + return; + } + + entry.count++; + next(); +} + const app = express(); app.use(cors({ origin: process.env.CORS_ORIGIN || "*" })); app.use(express.json()); +app.use(rateLimit); app.get("/health", (_req: Request, res: Response) => { 
res.status(200).json({ status: "ok", timestamp: new Date().toISOString() }); diff --git a/website-monitoring-frontend/.github/workflows/cron-scan.yml b/website-monitoring-frontend/.github/workflows/cron-scan.yml index 28cce32..3a5c370 100644 --- a/website-monitoring-frontend/.github/workflows/cron-scan.yml +++ b/website-monitoring-frontend/.github/workflows/cron-scan.yml @@ -26,7 +26,7 @@ jobs: DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}" echo "Running uptime checks at: $DEPLOYMENT_URL/api/cron/uptime" - response=$(curl -s -w "\n%{http_code}" "$DEPLOYMENT_URL/api/cron/uptime") + response=$(curl -s -w "\n%{http_code}" -H "Authorization: Bearer $CRON_SECRET" "$DEPLOYMENT_URL/api/cron/uptime") http_code=$(echo "$response" | tail -n1) response_body=$(echo "$response" | head -n -1) @@ -41,6 +41,7 @@ jobs: fi env: DEPLOYMENT_URL: ${{ secrets.DEPLOYMENT_URL }} + CRON_SECRET: ${{ secrets.CRON_SECRET }} CRON_SECRET: ${{ secrets.CRON_SECRET }} scan: runs-on: ubuntu-latest @@ -51,7 +52,7 @@ jobs: DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}" echo "Triggering scan at: $DEPLOYMENT_URL/api/cron/scan?mode=all" - response=$(curl -s -w "\n%{http_code}" -X POST "$DEPLOYMENT_URL/api/cron/scan?mode=all") + response=$(curl -s -w "\n%{http_code}" -X POST -H "Authorization: Bearer $CRON_SECRET" "$DEPLOYMENT_URL/api/cron/scan?mode=all") http_code=$(echo "$response" | tail -n1) response_body=$(echo "$response" | head -n -1) diff --git a/website-monitoring-frontend/Dockerfile b/website-monitoring-frontend/Dockerfile index ffe1832..13e91bd 100644 --- a/website-monitoring-frontend/Dockerfile +++ b/website-monitoring-frontend/Dockerfile @@ -1,16 +1,38 @@ -FROM node:18 - +# --- Stage 1: Dependencies --- +FROM node:20-slim AS deps WORKDIR /app - COPY package.json package-lock.json ./ +RUN npm ci -# Disable the oxide engine so it falls back to JS -ENV TAILWIND_DISABLE_OXIDE=1 - -RUN npm install - +# --- Stage 2: Build --- +FROM node:20-slim AS builder +WORKDIR /app 
+COPY --from=deps /app/node_modules ./node_modules COPY . . - +ENV NEXT_TELEMETRY_DISABLED=1 +ENV TAILWIND_DISABLE_OXIDE=1 RUN npm run build -CMD ["npm", "run", "start"] +# --- Stage 3: Production --- +FROM node:20-slim AS runtime +WORKDIR /app + +RUN groupadd -r app && useradd -r -g app -d /app app + +COPY --from=builder --chown=app:app /app/.next/standalone ./ +COPY --from=builder --chown=app:app /app/.next/static ./.next/static +COPY --from=builder --chown=app:app /app/public ./public + +USER app + +EXPOSE 3000 + +ENV PORT=3000 +ENV HOSTNAME="0.0.0.0" +ENV NODE_ENV=production +ENV NEXT_TELEMETRY_DISABLED=1 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \ + CMD node -e "const h=require('http');h.get('http://localhost:3000/api/health',(r)=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))" + +CMD ["node", "server.js"] diff --git a/website-monitoring-frontend/database-schema.sql b/website-monitoring-frontend/database-schema.sql index 23e9f68..0b1162d 100644 --- a/website-monitoring-frontend/database-schema.sql +++ b/website-monitoring-frontend/database-schema.sql @@ -203,4 +203,8 @@ CREATE TABLE IF NOT EXISTS alert_configurations ( created_at timestamp with time zone DEFAULT now(), updated_at timestamp with time zone DEFAULT now() ); + +-- Add template_hash to pages table for layout deduplication +ALTER TABLE pages ADD COLUMN IF NOT EXISTS template_hash VARCHAR; +CREATE INDEX IF NOT EXISTS idx_pages_template_hash ON pages(template_hash) WHERE template_hash IS NOT NULL; ); \ No newline at end of file diff --git a/website-monitoring-frontend/next.config.ts b/website-monitoring-frontend/next.config.ts index f0c7b13..256f16f 100644 --- a/website-monitoring-frontend/next.config.ts +++ b/website-monitoring-frontend/next.config.ts @@ -1,10 +1,27 @@ import type { NextConfig } from "next"; +const securityHeaders = [ + { key: "X-DNS-Prefetch-Control", value: "on" }, + { key: "Strict-Transport-Security", value: 
"max-age=63072000; includeSubDomains; preload" }, + { key: "X-Frame-Options", value: "SAMEORIGIN" }, + { key: "X-Content-Type-Options", value: "nosniff" }, + { key: "Referrer-Policy", value: "origin-when-cross-origin" }, + { key: "Permissions-Policy", value: "camera=(), microphone=(), geolocation=()" }, +]; + const nextConfig: NextConfig = { eslint: { // Do not fail production builds due to ESLint errors ignoreDuringBuilds: true, }, + async headers() { + return [ + { + source: "/(.*)", + headers: securityHeaders, + }, + ]; + }, }; export default nextConfig; diff --git a/website-monitoring-frontend/setup-database.sql b/website-monitoring-frontend/setup-database.sql index 979bbd7..0e3ec79 100644 --- a/website-monitoring-frontend/setup-database.sql +++ b/website-monitoring-frontend/setup-database.sql @@ -207,6 +207,7 @@ CREATE TABLE IF NOT EXISTS pages ( title VARCHAR, description TEXT, content_hash VARCHAR, + template_hash VARCHAR, content_type VARCHAR, status_code INTEGER, is_active BOOLEAN DEFAULT true, diff --git a/website-monitoring-frontend/src/app/api/admin/organizations/route.ts b/website-monitoring-frontend/src/app/api/admin/organizations/route.ts index 0376242..6c87cd9 100644 --- a/website-monitoring-frontend/src/app/api/admin/organizations/route.ts +++ b/website-monitoring-frontend/src/app/api/admin/organizations/route.ts @@ -1,12 +1,17 @@ import { NextResponse } from "next/server"; import { getSupabaseAdmin } from "@/lib/admin"; +import { requireAdmin } from "@/lib/apiAuth"; /** * GET /api/admin/organizations * * List all organizations with usage stats. + * Requires admin or owner role. */ export async function GET(request: Request) { + const auth = await requireAdmin(request); + if (auth instanceof NextResponse) return auth; + try { const supabase = getSupabaseAdmin(); const url = new URL(request.url); @@ -68,6 +73,9 @@ export async function GET(request: Request) { * Update organization: change tier, deactivate, etc. 
*/ export async function PATCH(request: Request) { + const auth = await requireAdmin(request); + if (auth instanceof NextResponse) return auth; + try { const supabase = getSupabaseAdmin(); const { organizationId, updates } = await request.json(); diff --git a/website-monitoring-frontend/src/app/api/admin/stats/route.ts b/website-monitoring-frontend/src/app/api/admin/stats/route.ts index 5522d68..da492ec 100644 --- a/website-monitoring-frontend/src/app/api/admin/stats/route.ts +++ b/website-monitoring-frontend/src/app/api/admin/stats/route.ts @@ -1,12 +1,17 @@ import { NextResponse } from "next/server"; import { getSupabaseAdmin } from "@/lib/admin"; +import { requireAdmin } from "@/lib/apiAuth"; /** * GET /api/admin/stats * * Returns system-wide statistics for the admin dashboard. + * Requires admin or owner role. */ -export async function GET() { +export async function GET(request: Request) { + const auth = await requireAdmin(request); + if (auth instanceof NextResponse) return auth; + try { const supabase = getSupabaseAdmin(); diff --git a/website-monitoring-frontend/src/app/api/admin/users/route.ts b/website-monitoring-frontend/src/app/api/admin/users/route.ts index 46345be..8c31a04 100644 --- a/website-monitoring-frontend/src/app/api/admin/users/route.ts +++ b/website-monitoring-frontend/src/app/api/admin/users/route.ts @@ -1,13 +1,18 @@ import { NextResponse } from "next/server"; import { getSupabaseAdmin } from "@/lib/admin"; +import { requireAdmin } from "@/lib/apiAuth"; /** * GET /api/admin/users * * List all users with their organization memberships and usage stats. * Query params: ?page=1&limit=20&search=keyword + * Requires admin or owner role. 
*/ export async function GET(request: Request) { + const auth = await requireAdmin(request); + if (auth instanceof NextResponse) return auth; + try { const supabase = getSupabaseAdmin(); const url = new URL(request.url); @@ -79,6 +84,9 @@ export async function GET(request: Request) { * Body: { userId, action, value } */ export async function PATCH(request: Request) { + const auth = await requireAdmin(request); + if (auth instanceof NextResponse) return auth; + try { const supabase = getSupabaseAdmin(); const { userId, action, value } = await request.json(); @@ -152,6 +160,9 @@ export async function PATCH(request: Request) { * Body: { userId } */ export async function DELETE(request: Request) { + const auth = await requireAdmin(request); + if (auth instanceof NextResponse) return auth; + try { const supabase = getSupabaseAdmin(); const { userId } = await request.json(); diff --git a/website-monitoring-frontend/src/app/api/billing/usage/route.ts b/website-monitoring-frontend/src/app/api/billing/usage/route.ts index 7e085bb..4f47e4e 100644 --- a/website-monitoring-frontend/src/app/api/billing/usage/route.ts +++ b/website-monitoring-frontend/src/app/api/billing/usage/route.ts @@ -1,16 +1,17 @@ import { NextResponse } from "next/server"; import { getSupabaseAdmin } from "@/lib/admin"; import { TIER_LIMITS } from "@/services/tierLimits"; +import { requireOrgMembership } from "@/lib/apiAuth"; /** * GET /api/billing/usage * * Returns current usage vs tier limits for an organization. + * Requires authenticated user who is a member of the organization. 
* Query params: ?organizationId=xxx */ export async function GET(request: Request) { try { - const supabase = getSupabaseAdmin(); const url = new URL(request.url); const organizationId = url.searchParams.get("organizationId"); @@ -18,6 +19,12 @@ export async function GET(request: Request) { return NextResponse.json({ error: "organizationId required" }, { status: 400 }); } + // Verify caller belongs to this organization + const auth = await requireOrgMembership(organizationId, request); + if (auth instanceof NextResponse) return auth; + + const supabase = getSupabaseAdmin(); + // Get organization with tier info const { data: org, error: orgError } = await supabase .from("organizations") diff --git a/website-monitoring-frontend/src/app/api/cron/scan/route.ts b/website-monitoring-frontend/src/app/api/cron/scan/route.ts index c1150b2..1c95751 100644 --- a/website-monitoring-frontend/src/app/api/cron/scan/route.ts +++ b/website-monitoring-frontend/src/app/api/cron/scan/route.ts @@ -2,8 +2,12 @@ import { NextResponse } from "next/server"; import { scanScheduler } from "@/services/scanScheduler"; import { lighthouseScanner } from "@/services/lighthouseScanner"; import { logError } from "@/utils/errorUtils"; +import { verifyCronSecret } from "@/lib/apiAuth"; export async function GET(request: Request) { + const authError = verifyCronSecret(request); + if (authError) return authError; + try { const url = new URL(request.url); const mode = url.searchParams.get("mode") || "all"; // "scheduled", "change_detection", "all" diff --git a/website-monitoring-frontend/src/app/api/cron/uptime/route.ts b/website-monitoring-frontend/src/app/api/cron/uptime/route.ts index 50c07ac..0e3fd70 100644 --- a/website-monitoring-frontend/src/app/api/cron/uptime/route.ts +++ b/website-monitoring-frontend/src/app/api/cron/uptime/route.ts @@ -1,16 +1,21 @@ import { NextResponse } from "next/server"; import { performUptimeChecks, evaluateUptimeAlerts } from "@/services/uptimeService"; +import { 
verifyCronSecret } from "@/lib/apiAuth"; /** * GET /api/cron/uptime * * Performs uptime checks on all active websites and evaluates alert rules. * Designed to be called by a cron job (e.g., GitHub Actions, Vercel Cron, or external scheduler). + * Requires CRON_SECRET authorization in production. * * Query params: * - alerts=true (default) — also evaluate alert rules after checks */ export async function GET(request: Request) { + const authError = verifyCronSecret(request); + if (authError) return authError; + const startTime = Date.now(); try { diff --git a/website-monitoring-frontend/src/lib/apiAuth.ts b/website-monitoring-frontend/src/lib/apiAuth.ts new file mode 100644 index 0000000..8e7ebb3 --- /dev/null +++ b/website-monitoring-frontend/src/lib/apiAuth.ts @@ -0,0 +1,148 @@ +import { createClient } from "@supabase/supabase-js"; +import { NextResponse } from "next/server"; +import { getSupabaseAdmin } from "./admin"; + +/** + * Verify CRON_SECRET for cron endpoints. + * Returns null if valid, or a NextResponse error if invalid. + */ +export function verifyCronSecret(request: Request): NextResponse | null { + const authHeader = request.headers.get("authorization"); + const cronSecret = process.env.CRON_SECRET; + + if (!cronSecret) { + // If no secret configured, allow in development only + if (process.env.NODE_ENV === "development") return null; + return NextResponse.json( + { error: "CRON_SECRET not configured" }, + { status: 500 } + ); + } + + if (authHeader !== `Bearer ${cronSecret}`) { + return NextResponse.json( + { error: "Unauthorized" }, + { status: 401 } + ); + } + + return null; +} + +interface AuthResult { + userId: string; + role: string | null; + organizationId: string | null; +} + +/** + * Authenticate the current user from the request cookies or Authorization header. + * Returns user info or a NextResponse error. 
+ */ +export async function authenticateUser(request?: Request): Promise { + try { + // Try to get the access token from the Authorization header or cookies + let accessToken: string | null = null; + + if (request) { + const authHeader = request.headers.get("authorization"); + if (authHeader?.startsWith("Bearer ")) { + accessToken = authHeader.slice(7); + } + + // Try to extract from Supabase auth cookie + if (!accessToken) { + const cookieHeader = request.headers.get("cookie") || ""; + const match = cookieHeader.match(/sb-[^=]+-auth-token=([^;]+)/); + if (match) { + try { + const decoded = decodeURIComponent(match[1]); + const parsed = JSON.parse(decoded); + accessToken = parsed?.[0] || parsed?.access_token || null; + } catch { + // Cookie might be the token directly + accessToken = match[1]; + } + } + } + } + + if (!accessToken) { + return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + } + + // Verify the token using Supabase + const supabase = createClient( + process.env.NEXT_PUBLIC_SUPABASE_URL!, + process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY! + ); + + const { data: { user }, error } = await supabase.auth.getUser(accessToken); + + if (error || !user) { + return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + } + + // Get user details (role, org) + const admin = getSupabaseAdmin(); + const { data: userData } = await admin + .from("users") + .select("organization_id, role") + .eq("id", user.id) + .single(); + + return { + userId: user.id, + role: (userData?.role as string) || user.user_metadata?.role || null, + organizationId: (userData?.organization_id as string) || null, + }; + } catch { + return NextResponse.json({ error: "Authentication failed" }, { status: 401 }); + } +} + +/** + * Require admin role. Returns AuthResult if authorized, or a NextResponse error. 
+ */ +export async function requireAdmin(request?: Request): Promise { + const auth = await authenticateUser(request); + if (auth instanceof NextResponse) return auth; + + if (auth.role !== "owner" && auth.role !== "admin") { + return NextResponse.json({ error: "Forbidden: admin access required" }, { status: 403 }); + } + + return auth; +} + +/** + * Require membership in the given organization. + */ +export async function requireOrgMembership( + organizationId: string, + request?: Request +): Promise { + const auth = await authenticateUser(request); + if (auth instanceof NextResponse) return auth; + + // Admins/owners can access any org + if (auth.role === "owner" || auth.role === "admin") return auth; + + // Check org membership + const admin = getSupabaseAdmin(); + const { data: membership } = await admin + .from("organization_members") + .select("id") + .eq("user_id", auth.userId) + .eq("organization_id", organizationId) + .single(); + + if (!membership) { + return NextResponse.json( + { error: "Forbidden: not a member of this organization" }, + { status: 403 } + ); + } + + return auth; +} diff --git a/website-monitoring-frontend/src/lib/validateEnv.ts b/website-monitoring-frontend/src/lib/validateEnv.ts new file mode 100644 index 0000000..08f748d --- /dev/null +++ b/website-monitoring-frontend/src/lib/validateEnv.ts @@ -0,0 +1,52 @@ +/** + * Validates that required environment variables are present. + * Call this at server startup or in API routes. 
+ */ +const REQUIRED_SERVER_VARS = [ + "NEXT_PUBLIC_SUPABASE_URL", + "NEXT_PUBLIC_SUPABASE_ANON_KEY", +]; + +const REQUIRED_FOR_ADMIN = [ + "SUPABASE_SERVICE_ROLE_KEY", +]; + +const OPTIONAL_VARS = [ + "CRON_SECRET", + "RESEND_API_KEY", + "LIGHTHOUSE_SERVICE_URL", +]; + +export function validateEnv(): { valid: boolean; missing: string[]; warnings: string[] } { + const missing: string[] = []; + const warnings: string[] = []; + + for (const v of REQUIRED_SERVER_VARS) { + if (!process.env[v]) missing.push(v); + } + + for (const v of REQUIRED_FOR_ADMIN) { + if (!process.env[v]) missing.push(v); + } + + for (const v of OPTIONAL_VARS) { + if (!process.env[v]) warnings.push(`${v} not set — related features will be disabled`); + } + + if (!process.env.CRON_SECRET && process.env.NODE_ENV === "production") { + warnings.push("CRON_SECRET not set — cron endpoints are unprotected in production!"); + } + + return { valid: missing.length === 0, missing, warnings }; +} + +// Auto-validate on import (server-side only) +if (typeof window === "undefined") { + const { valid, missing, warnings } = validateEnv(); + if (!valid) { + console.error(`[ENV] Missing required environment variables: ${missing.join(", ")}`); + } + for (const w of warnings) { + console.warn(`[ENV] ${w}`); + } +} diff --git a/website-monitoring-frontend/src/services/newCrawlerService.ts b/website-monitoring-frontend/src/services/newCrawlerService.ts index 68686e5..265f871 100644 --- a/website-monitoring-frontend/src/services/newCrawlerService.ts +++ b/website-monitoring-frontend/src/services/newCrawlerService.ts @@ -219,6 +219,7 @@ export class NewCrawlerService { try { const urlObj = new URL(url); const contentHash = await this.computeContentHash(html); + const templateHash = await this.computeTemplateHash(html); // Check if page already exists const { data: existingPage } = await getSupabaseAdmin() @@ -236,6 +237,7 @@ export class NewCrawlerService { title, description, content_hash: contentHash, + template_hash: 
templateHash, last_crawled_at: new Date().toISOString(), metadata: { crawl_session_id: this.sessionId, @@ -254,6 +256,7 @@ export class NewCrawlerService { title, description, content_hash: contentHash, + template_hash: templateHash, content_type: "text/html", status_code: 200, depth: this.currentDepth, @@ -272,6 +275,51 @@ export class NewCrawlerService { } } + /** + * Compute a template hash from HTML — strips text content and dynamic attributes, + * keeping only the DOM structure (tag names, class names, hierarchy). + * Pages with the same layout (e.g., product pages) will share the same template_hash. + */ + private async computeTemplateHash(html: string): Promise { + try { + const { JSDOM: JSDOMParser } = await import("jsdom"); + const dom = new JSDOMParser(html); + const skeleton = this.extractDomSkeleton(dom.window.document.body); + return this.computeContentHash(skeleton); + } catch { + // Fallback: hash the raw HTML if JSDOM fails + return this.computeContentHash(html); + } + } + + /** + * Extract a structural skeleton of the DOM: tag names + class names only. + * This ignores text content, ids, data attributes, images, etc. + */ + private extractDomSkeleton(element: Element | null): string { + if (!element) return ""; + + const parts: string[] = []; + const walk = (el: Element, depth: number) => { + if (depth > 20) return; // prevent infinite recursion + const tag = el.tagName.toLowerCase(); + // Skip script, style, svg, noscript — they're not layout + if (["script", "style", "svg", "noscript"].includes(tag)) return; + + const classes = el.className && typeof el.className === "string" + ? el.className.split(/\s+/).sort().join(".") + : ""; + parts.push(`${" ".repeat(depth)}<${tag}${classes ? "." 
+ classes : ""}>`); + + for (const child of Array.from(el.children)) { + walk(child, depth + 1); + } + }; + + walk(element, 0); + return parts.join("\n"); + } + private extractLinks(document: Document, baseUrl: string): string[] { const links = Array.from(document.querySelectorAll("a[href]")) .map((link) => { diff --git a/website-monitoring-frontend/src/services/scanScheduler.ts b/website-monitoring-frontend/src/services/scanScheduler.ts index fb938d3..0f1190b 100644 --- a/website-monitoring-frontend/src/services/scanScheduler.ts +++ b/website-monitoring-frontend/src/services/scanScheduler.ts @@ -105,7 +105,8 @@ export class ScanScheduler { } /** - * Process a single scheduled scan + * Process a single scheduled scan — scans ALL unique-layout pages, not just the main page. + * Groups pages by template_hash to avoid scanning duplicate layouts (e.g., product pages). */ private async processScheduledScan(scheduledScan: ScheduledScan): Promise { try { @@ -123,35 +124,54 @@ export class ScanScheduler { return; } - // Get the main page for this website - const { data: page, error: pageError } = await this.supabase + // Get ALL active pages for this website, grouped by template_hash + const { data: pages, error: pageError } = await this.supabase .from('pages') - .select('id') + .select('id, url, path, template_hash, depth, content_hash') .eq('website_id', scheduledScan.websiteId) .eq('is_active', true) - .order('created_at', { ascending: false }) - .limit(1) - .single(); + .order('depth', { ascending: true }) + .order('created_at', { ascending: true }); - if (pageError || !page) { - logError('No active page found for scheduled scan', pageError, { + if (pageError || !pages || pages.length === 0) { + logError('No active pages found for scheduled scan', pageError, { websiteId: scheduledScan.websiteId, }); return; } - // Perform scans for each device type - for (const deviceType of scheduledScan.deviceTypes) { - const scanConfig: ScanConfig = { - websiteId: 
scheduledScan.websiteId, - pageId: page.id as string, - deviceType, - categories: scheduledScan.categories, - priority: 'medium', - triggeredBy: 'scheduled', - }; + // Deduplicate pages by template_hash — scan only one page per unique layout + const uniquePages = this.deduplicateByLayout(pages); - await lighthouseScanner.performScan(scanConfig); + console.info(JSON.stringify({ + level: 'info', + event: 'scan_pages_selected', + websiteId: scheduledScan.websiteId, + totalPages: pages.length, + uniqueLayouts: uniquePages.length, + timestamp: new Date().toISOString(), + })); + + // Scan each unique page + for (const page of uniquePages) { + // Re-check limits before each scan + const { canScan: stillCanScan } = await lighthouseScanner.checkSubscriptionLimits( + scheduledScan.organizationId + ); + if (!stillCanScan) break; + + for (const deviceType of scheduledScan.deviceTypes) { + const scanConfig: ScanConfig = { + websiteId: scheduledScan.websiteId, + pageId: page.id as string, + deviceType, + categories: scheduledScan.categories, + priority: (page.depth as number) === 0 ? 'high' : 'medium', + triggeredBy: 'scheduled', + }; + + await lighthouseScanner.performScan(scanConfig); + } } // Update the last run time @@ -161,6 +181,45 @@ export class ScanScheduler { } } + /** + * Deduplicate pages by template_hash. + * If a template_hash exists, only scan the shallowest (lowest depth) page with that hash. + * Pages without template_hash are always included (treated as unique). + * The main page (depth=0) is always included. + */ + private deduplicateByLayout(pages: Record[]): Record[] { + const seenHashes = new Set(); + const result: Record[] = []; + + for (const page of pages) { + const depth = Number(page.depth ?? 
0); + const templateHash = page.template_hash as string | null; + + // Always include the main page + if (depth === 0) { + if (templateHash) seenHashes.add(templateHash); + result.push(page); + continue; + } + + // If no template_hash, fall back to content_hash for dedup + const hash = templateHash || (page.content_hash as string | null); + + if (!hash) { + // No hash at all — include it (unique by default) + result.push(page); + continue; + } + + if (!seenHashes.has(hash)) { + seenHashes.add(hash); + result.push(page); + } + } + + return result; + } + /** * Check for website changes and trigger scans if needed */ @@ -213,24 +272,23 @@ export class ScanScheduler { return; } - // Get the main page - const { data: page, error: pageError } = await this.supabase + // Get all unique-layout pages (not just main) + const { data: pages, error: pageError } = await this.supabase .from('pages') - .select('id') + .select('id, url, path, template_hash, depth, content_hash') .eq('website_id', website.id) .eq('is_active', true) - .order('created_at', { ascending: false }) - .limit(1) - .single(); + .order('depth', { ascending: true }); - if (pageError || !page) { + if (pageError || !pages || pages.length === 0) { return; } - // Trigger a high-priority scan due to changes + // Scan main page with high priority on change + const mainPage = pages[0]; const scanConfig: ScanConfig = { websiteId: website.id, - pageId: page.id as string, + pageId: mainPage.id as string, deviceType: 'desktop', // Start with desktop for change detection categories: ['performance', 'accessibility', 'seo', 'best_practices'], priority: 'high',