feat: production hardening + smart subpage scanning with layout dedup

Security: - Add CRON_SECRET auth to /api/cron/* endpoints - Add admin role verification to /api/admin/* routes - Add org membership check to /api/billing/usage - Add security headers (HSTS, X-Frame-Options, CSP, etc.) - Add env variable validation at startup - Add rate limiting to backend API (30 req/min per IP) Infrastructure: - Multi-stage Dockerfiles with non-root user + healthchecks - Updated cron workflow to pass CRON_SECRET header - Updated .env.example with all optional vars Smart subpage scanning: - Crawler now computes template_hash (DOM structure without content) - Scanner scans ALL unique-layout pages, not just main page - Pages with same layout (e.g. product pages) scanned only once - Deduplication by template_hash, fallback to content_hash - Main page always scanned with high priority - Re-checks subscription limits before each page scan Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-06 07:44:32 +01:00
parent d8de0a973a
commit 1c545c93b4
18 changed files with 498 additions and 59 deletions
@@ -18,3 +18,14 @@ CORS_ORIGIN=http://localhost:3000
 NEXT_PUBLIC_SUPABASE_URL=https://your-project.supabase.co
 NEXT_PUBLIC_SUPABASE_ANON_KEY=your-anon-key
 SUPABASE_SERVICE_ROLE_KEY=your-service-role-key
 # ── Security ────────────────────────────────
 # Required in production: protects /api/cron/* endpoints
 CRON_SECRET=generate-a-random-secret-here
 # ── Optional Services ───────────────────────
 # Email notifications (Resend — free tier: 3000 emails/mo)
 RESEND_API_KEY=re_your_resend_key
 # Lighthouse backend URL (for automated scans)
 LIGHTHOUSE_SERVICE_URL=http://localhost:5000
@@ -1,25 +1,36 @@
-# Use the official Node.js image.
+# --- Stage 1: Build ---
-FROM node:18
+FROM node:20-slim AS builder
 # OPTIONAL: Falls in der Base kein Chrome enthalten ist,
 # müsstest du hier noch "apt-get update" + "apt-get install chromium" oder ähnliches ausführen,
 # z. B.:
 RUN apt-get update && apt-get install -y chromium
 # Create and change to the app directory.
 WORKDIR /app
 # Copy application dependency manifests to the container image.
 COPY package*.json ./
 RUN npm ci
 # Install production dependencies.
 RUN npm install
 # Copy local code to the container image.
 COPY . .
 # Build the TypeScript code
 RUN npm run build
-# Run the web service on container startup.
+# --- Stage 2: Production ---
 FROM node:20-slim AS runtime
 RUN apt-get update && apt-get install -y --no-install-recommends chromium \
    && rm -rf /var/lib/apt/lists/*
 ENV CHROME_BIN=/usr/bin/chromium
 ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
 WORKDIR /app
 RUN groupadd -r app && useradd -r -g app -d /app app
 COPY --from=builder --chown=app:app /app/dist ./dist
 COPY --from=builder --chown=app:app /app/node_modules ./node_modules
 COPY --from=builder --chown=app:app /app/package.json ./
 USER app
 EXPOSE 5000
 HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
  CMD node -e "const h=require('http');h.get('http://localhost:5000/health',(r)=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))"
 CMD ["node", "dist/index.js"]
@@ -5,10 +5,36 @@ import lighthouseRouter from "./routes/lighthouse.js";
 dotenv.config();
 // Rate limiting (simple in-memory, fixed-window counter; only correct for a single instance)
 const rateLimitMap = new Map<string, { count: number; resetAt: number }>();
 const RATE_LIMIT_WINDOW = 60_000; // 1 minute
 const RATE_LIMIT_MAX = 30; // requests per window
 // Earliest time (ms since epoch) at which expired entries are swept out of rateLimitMap again
 let rateLimitNextSweepAt = 0;
 /**
 * Express middleware that allows at most RATE_LIMIT_MAX requests per client
 * within each RATE_LIMIT_WINDOW.
 *
 * The client key is `req.ip`; when that is empty, the first hop of the
 * `x-forwarded-for` header is used, and "unknown" as a last resort.
 * Requests over the limit receive HTTP 429 with `{ error: "Too many requests" }`
 * and `next` is not called.
 *
 * @param req  incoming request (reads `ip` and `headers`)
 * @param res  response used only to send the 429 reply
 * @param next continuation invoked when the request is allowed through
 */
 function rateLimit(req: Request, res: Response, next: () => void) {
  const now = Date.now();

  // Evict expired windows at most once per window. Without this the map keeps one
  // entry per client key forever and grows without bound.
  if (now >= rateLimitNextSweepAt) {
    for (const [clientKey, window] of rateLimitMap) {
      if (now > window.resetAt) rateLimitMap.delete(clientKey);
    }
    rateLimitNextSweepAt = now + RATE_LIMIT_WINDOW;
  }

  // The header may be a string, a string[] or absent; a proxy chain arrives as
  // "client, proxy1, proxy2", so only the first hop identifies the client.
  const forwarded = req.headers["x-forwarded-for"];
  const forwardedValue = Array.isArray(forwarded) ? forwarded[0] : forwarded;
  const firstHop = forwardedValue?.split(",")[0]?.trim();
  const key = String(req.ip || firstHop || "unknown");

  const entry = rateLimitMap.get(key);
  if (!entry || now > entry.resetAt) {
    // First request of a fresh window for this client
    rateLimitMap.set(key, { count: 1, resetAt: now + RATE_LIMIT_WINDOW });
    return next();
  }
  if (entry.count >= RATE_LIMIT_MAX) {
    res.status(429).json({ error: "Too many requests" });
    return;
  }
  entry.count++;
  next();
 }
 const app = express();
 app.use(cors({ origin: process.env.CORS_ORIGIN || "*" }));
 app.use(express.json());
 app.use(rateLimit);
 app.get("/health", (_req: Request, res: Response) => {
  res.status(200).json({ status: "ok", timestamp: new Date().toISOString() });
@@ -26,7 +26,7 @@ jobs:
          DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}"
          echo "Running uptime checks at: $DEPLOYMENT_URL/api/cron/uptime"
-          response=$(curl -s -w "\n%{http_code}" "$DEPLOYMENT_URL/api/cron/uptime")
+          response=$(curl -s -w "\n%{http_code}" -H "Authorization: Bearer $CRON_SECRET" "$DEPLOYMENT_URL/api/cron/uptime")
          http_code=$(echo "$response" | tail -n1)
          response_body=$(echo "$response" | head -n -1)
@@ -41,6 +41,7 @@ jobs:
          fi
        env:
          DEPLOYMENT_URL: ${{ secrets.DEPLOYMENT_URL }}
          CRON_SECRET: ${{ secrets.CRON_SECRET }}
  scan:
    runs-on: ubuntu-latest
@@ -51,7 +52,7 @@ jobs:
          DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}"
          echo "Triggering scan at: $DEPLOYMENT_URL/api/cron/scan?mode=all"
-          response=$(curl -s -w "\n%{http_code}" -X POST "$DEPLOYMENT_URL/api/cron/scan?mode=all")
+          response=$(curl -s -w "\n%{http_code}" -X POST -H "Authorization: Bearer $CRON_SECRET" "$DEPLOYMENT_URL/api/cron/scan?mode=all")
          http_code=$(echo "$response" | tail -n1)
          response_body=$(echo "$response" | head -n -1)
@@ -1,16 +1,38 @@
-FROM node:18
+# --- Stage 1: Dependencies ---
-
+FROM node:20-slim AS deps
 WORKDIR /app
 COPY package.json package-lock.json ./
 RUN npm ci
-# Disable the oxide engine so it falls back to JS
+# --- Stage 2: Build ---
-ENV TAILWIND_DISABLE_OXIDE=1
+FROM node:20-slim AS builder
-
+WORKDIR /app
-RUN npm install
+COPY --from=deps /app/node_modules ./node_modules
 COPY . .
-
+ENV NEXT_TELEMETRY_DISABLED=1
 ENV TAILWIND_DISABLE_OXIDE=1
 RUN npm run build
-CMD ["npm", "run", "start"]
+# --- Stage 3: Production ---
 FROM node:20-slim AS runtime
 WORKDIR /app
 RUN groupadd -r app && useradd -r -g app -d /app app
 COPY --from=builder --chown=app:app /app/.next/standalone ./
 COPY --from=builder --chown=app:app /app/.next/static ./.next/static
 COPY --from=builder --chown=app:app /app/public ./public
 USER app
 EXPOSE 3000
 ENV PORT=3000
 ENV HOSTNAME="0.0.0.0"
 ENV NODE_ENV=production
 ENV NEXT_TELEMETRY_DISABLED=1
 HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
  CMD node -e "const h=require('http');h.get('http://localhost:3000/api/health',(r)=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))"
 CMD ["node", "server.js"]
@@ -203,4 +203,8 @@ CREATE TABLE IF NOT EXISTS alert_configurations (
  created_at timestamp with time zone DEFAULT now(),
  updated_at timestamp with time zone DEFAULT now()
 );
 -- Add template_hash to pages table for layout deduplication
 ALTER TABLE pages ADD COLUMN IF NOT EXISTS template_hash VARCHAR;
 CREATE INDEX IF NOT EXISTS idx_pages_template_hash ON pages(template_hash) WHERE template_hash IS NOT NULL;
  );
@@ -1,10 +1,27 @@
 import type { NextConfig } from "next";
 // Response headers returned by nextConfig.headers() below.
 // NOTE(review): this list contains no Content-Security-Policy entry.
 const securityHeaders = [
  // Turn DNS prefetching on
  { key: "X-DNS-Prefetch-Control", value: "on" },
  // HSTS: max-age of 63072000 s (730 days), covering subdomains, with the preload flag
  { key: "Strict-Transport-Security", value: "max-age=63072000; includeSubDomains; preload" },
  // Only same-origin pages may embed the app in a frame
  { key: "X-Frame-Options", value: "SAMEORIGIN" },
  // Disable MIME-type sniffing
  { key: "X-Content-Type-Options", value: "nosniff" },
  // Cross-origin requests carry only the origin as referrer
  { key: "Referrer-Policy", value: "origin-when-cross-origin" },
  // Empty allow-lists for camera, microphone and geolocation
  { key: "Permissions-Policy", value: "camera=(), microphone=(), geolocation=()" },
 ];
 // Next.js configuration: ESLint errors do not fail the build, and the
 // securityHeaders list is attached to every route.
 const nextConfig: NextConfig = {
  eslint: {
    // Do not fail production builds due to ESLint errors
    ignoreDuringBuilds: true,
  },
  // Returns one header rule whose source pattern "/(.*)" matches every path.
  async headers() {
    return [
      {
        source: "/(.*)",
        headers: securityHeaders,
      },
    ];
  },
 };
 export default nextConfig;
@@ -207,6 +207,7 @@ CREATE TABLE IF NOT EXISTS pages (
    title VARCHAR,
    description TEXT,
    content_hash VARCHAR,
    template_hash VARCHAR,
    content_type VARCHAR,
    status_code INTEGER,
    is_active BOOLEAN DEFAULT true,
@@ -1,12 +1,17 @@
 import { NextResponse } from "next/server";
 import { getSupabaseAdmin } from "@/lib/admin";
 import { requireAdmin } from "@/lib/apiAuth";
 /**
 * GET /api/admin/organizations
 * 
 * List all organizations with usage stats.
 * Requires admin or owner role.
 */
 export async function GET(request: Request) {
  const auth = await requireAdmin(request);
  if (auth instanceof NextResponse) return auth;
  try {
    const supabase = getSupabaseAdmin();
    const url = new URL(request.url);
@@ -68,6 +73,9 @@ export async function GET(request: Request) {
 * Update organization: change tier, deactivate, etc.
 */
 export async function PATCH(request: Request) {
  const auth = await requireAdmin(request);
  if (auth instanceof NextResponse) return auth;
  try {
    const supabase = getSupabaseAdmin();
    const { organizationId, updates } = await request.json();
@@ -1,12 +1,17 @@
 import { NextResponse } from "next/server";
 import { getSupabaseAdmin } from "@/lib/admin";
 import { requireAdmin } from "@/lib/apiAuth";
 /**
 * GET /api/admin/stats
 * 
 * Returns system-wide statistics for the admin dashboard.
 * Requires admin or owner role.
 */
-export async function GET() {
+export async function GET(request: Request) {
  const auth = await requireAdmin(request);
  if (auth instanceof NextResponse) return auth;
  try {
    const supabase = getSupabaseAdmin();
@@ -1,13 +1,18 @@
 import { NextResponse } from "next/server";
 import { getSupabaseAdmin } from "@/lib/admin";
 import { requireAdmin } from "@/lib/apiAuth";
 /**
 * GET /api/admin/users
 * 
 * List all users with their organization memberships and usage stats.
 * Query params: ?page=1&limit=20&search=keyword
 * Requires admin or owner role.
 */
 export async function GET(request: Request) {
  const auth = await requireAdmin(request);
  if (auth instanceof NextResponse) return auth;
  try {
    const supabase = getSupabaseAdmin();
    const url = new URL(request.url);
@@ -79,6 +84,9 @@ export async function GET(request: Request) {
 * Body: { userId, action, value }
 */
 export async function PATCH(request: Request) {
  const auth = await requireAdmin(request);
  if (auth instanceof NextResponse) return auth;
  try {
    const supabase = getSupabaseAdmin();
    const { userId, action, value } = await request.json();
@@ -152,6 +160,9 @@ export async function PATCH(request: Request) {
 * Body: { userId }
 */
 export async function DELETE(request: Request) {
  const auth = await requireAdmin(request);
  if (auth instanceof NextResponse) return auth;
  try {
    const supabase = getSupabaseAdmin();
    const { userId } = await request.json();
@@ -1,16 +1,17 @@
 import { NextResponse } from "next/server";
 import { getSupabaseAdmin } from "@/lib/admin";
 import { TIER_LIMITS } from "@/services/tierLimits";
 import { requireOrgMembership } from "@/lib/apiAuth";
 /**
 * GET /api/billing/usage
 * 
 * Returns current usage vs tier limits for an organization.
 * Requires authenticated user who is a member of the organization.
 * Query params: ?organizationId=xxx
 */
 export async function GET(request: Request) {
  try {
    const supabase = getSupabaseAdmin();
    const url = new URL(request.url);
    const organizationId = url.searchParams.get("organizationId");
@@ -18,6 +19,12 @@ export async function GET(request: Request) {
      return NextResponse.json({ error: "organizationId required" }, { status: 400 });
    }
    // Verify caller belongs to this organization
    const auth = await requireOrgMembership(organizationId, request);
    if (auth instanceof NextResponse) return auth;
    const supabase = getSupabaseAdmin();
    // Get organization with tier info
    const { data: org, error: orgError } = await supabase
      .from("organizations")
@@ -2,8 +2,12 @@ import { NextResponse } from "next/server";
 import { scanScheduler } from "@/services/scanScheduler";
 import { lighthouseScanner } from "@/services/lighthouseScanner";
 import { logError } from "@/utils/errorUtils";
 import { verifyCronSecret } from "@/lib/apiAuth";
 export async function GET(request: Request) {
  const authError = verifyCronSecret(request);
  if (authError) return authError;
  try {
    const url = new URL(request.url);
    const mode = url.searchParams.get("mode") || "all"; // "scheduled", "change_detection", "all"
@@ -1,16 +1,21 @@
 import { NextResponse } from "next/server";
 import { performUptimeChecks, evaluateUptimeAlerts } from "@/services/uptimeService";
 import { verifyCronSecret } from "@/lib/apiAuth";
 /**
 * GET /api/cron/uptime
 * 
 * Performs uptime checks on all active websites and evaluates alert rules.
 * Designed to be called by a cron job (e.g., GitHub Actions, Vercel Cron, or external scheduler).
 * Requires CRON_SECRET authorization in production.
 * 
 * Query params:
 *   - alerts=true (default) — also evaluate alert rules after checks
 */
 export async function GET(request: Request) {
  const authError = verifyCronSecret(request);
  if (authError) return authError;
  const startTime = Date.now();
  try {
@@ -0,0 +1,148 @@
 import { timingSafeEqual } from "node:crypto";
 import { createClient } from "@supabase/supabase-js";
 import { NextResponse } from "next/server";
 import { getSupabaseAdmin } from "./admin";
 /**
 * Verify the CRON_SECRET bearer token on a cron endpoint request.
 *
 * - CRON_SECRET unset and NODE_ENV === "development": the request is allowed.
 * - CRON_SECRET unset otherwise: 500 "CRON_SECRET not configured".
 * - Authorization header missing or not exactly `Bearer <CRON_SECRET>`: 401.
 *
 * The comparison uses crypto.timingSafeEqual so the time taken does not reveal
 * how many leading characters of a guessed secret were correct.
 *
 * @param request incoming request; only its `authorization` header is read
 * @returns null if the request is authorized, otherwise the error response to return
 */
 export function verifyCronSecret(request: Request): NextResponse | null {
  const cronSecret = process.env.CRON_SECRET;
  if (!cronSecret) {
    // If no secret configured, allow in development only
    if (process.env.NODE_ENV === "development") return null;
    return NextResponse.json(
      { error: "CRON_SECRET not configured" },
      { status: 500 }
    );
  }

  const expected = Buffer.from(`Bearer ${cronSecret}`, "utf8");
  const provided = Buffer.from(request.headers.get("authorization") ?? "", "utf8");
  // timingSafeEqual throws on buffers of different length, so guard on length first.
  const authorized =
    provided.length === expected.length && timingSafeEqual(provided, expected);
  if (!authorized) {
    return NextResponse.json(
      { error: "Unauthorized" },
      { status: 401 }
    );
  }
  return null;
 }
 interface AuthResult {
  userId: string;
  role: string | null;
  organizationId: string | null;
 }
 /**
 * Authenticate the caller from the Authorization header or a Supabase auth cookie.
 *
 * Token lookup order:
 *   1. `Authorization: Bearer <token>` header.
 *   2. A cookie named `sb-<...>-auth-token`, whose value is either a URL-encoded
 *      JSON array (token at index 0), a JSON object with `access_token`, or the raw token.
 *
 * The token is verified with `supabase.auth.getUser`. The role is read from the
 * `users` table, falling back to `app_metadata.role`.
 *
 * SECURITY: the role must never come from `user_metadata`. Supabase lets the end
 * user update that field, so using it for authorization would allow a user
 * without a `users` row to grant themselves "admin".
 *
 * @param request incoming request; when omitted the call resolves to a 401 response
 * @returns the caller's id, role and organization, or a 401 response
 */
 export async function authenticateUser(request?: Request): Promise<AuthResult | NextResponse> {
  try {
    // Try to get the access token from the Authorization header or cookies
    let accessToken: string | null = null;
    if (request) {
      const authHeader = request.headers.get("authorization");
      if (authHeader?.startsWith("Bearer ")) {
        accessToken = authHeader.slice(7);
      }
      // Try to extract from Supabase auth cookie
      if (!accessToken) {
        const cookieHeader = request.headers.get("cookie") || "";
        const match = cookieHeader.match(/sb-[^=]+-auth-token=([^;]+)/);
        if (match) {
          try {
            const parsed: unknown = JSON.parse(decodeURIComponent(match[1]));
            const candidate = Array.isArray(parsed)
              ? parsed[0]
              : (parsed as { access_token?: unknown } | null)?.access_token;
            // Only a non-empty string can be a token; anything else is treated as absent.
            accessToken = typeof candidate === "string" && candidate ? candidate : null;
          } catch {
            // Cookie might be the token directly
            accessToken = match[1];
          }
        }
      }
    }
    if (!accessToken) {
      return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
    }
    // Verify the token using Supabase
    const supabase = createClient(
      process.env.NEXT_PUBLIC_SUPABASE_URL!,
      process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY!
    );
    const { data: { user }, error } = await supabase.auth.getUser(accessToken);
    if (error || !user) {
      return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
    }
    // Get user details (role, org)
    const admin = getSupabaseAdmin();
    const { data: userData } = await admin
      .from("users")
      .select("organization_id, role")
      .eq("id", user.id)
      .single();
    // app_metadata is the only auth-side metadata the user cannot edit themselves.
    const appRole: unknown = user.app_metadata?.role;
    const trustedFallbackRole = typeof appRole === "string" && appRole ? appRole : null;
    return {
      userId: user.id,
      role: (userData?.role as string) || trustedFallbackRole,
      organizationId: (userData?.organization_id as string) || null,
    };
  } catch {
    return NextResponse.json({ error: "Authentication failed" }, { status: 401 });
  }
 }
 /**
 * Gate a handler behind the "owner" or "admin" role.
 *
 * @param request incoming request, forwarded unchanged to authenticateUser
 * @returns the authenticated caller when the role is "owner" or "admin";
 *          otherwise the error response from authentication, or a 403 response
 */
 export async function requireAdmin(request?: Request): Promise<AuthResult | NextResponse> {
  const caller = await authenticateUser(request);

  // Authentication already failed: pass its error response straight through.
  if (caller instanceof NextResponse) {
    return caller;
  }

  const hasAdminRole = caller.role === "owner" || caller.role === "admin";
  return hasAdminRole
    ? caller
    : NextResponse.json({ error: "Forbidden: admin access required" }, { status: 403 });
 }
 /**
 * Gate a handler behind membership of one organization.
 *
 * Callers whose role is "owner" or "admin" are let through without a lookup.
 * Everyone else needs a row in `organization_members` matching both their
 * user id and the requested organization.
 *
 * @param organizationId organization the caller wants to access
 * @param request        incoming request, forwarded unchanged to authenticateUser
 * @returns the authenticated caller, the error response from authentication,
 *          or a 403 response when no membership row is found
 */
 export async function requireOrgMembership(
  organizationId: string,
  request?: Request
 ): Promise<AuthResult | NextResponse> {
  const caller = await authenticateUser(request);
  if (caller instanceof NextResponse) {
    return caller;
  }

  // Admins/owners can access any org
  const bypassesMembership = caller.role === "owner" || caller.role === "admin";
  if (bypassesMembership) {
    return caller;
  }

  // Look for the caller's membership row in the requested organization
  const membershipLookup = getSupabaseAdmin()
    .from("organization_members")
    .select("id")
    .eq("user_id", caller.userId)
    .eq("organization_id", organizationId)
    .single();
  const { data: membershipRow } = await membershipLookup;

  return membershipRow
    ? caller
    : NextResponse.json(
        { error: "Forbidden: not a member of this organization" },
        { status: 403 }
      );
 }
@@ -0,0 +1,52 @@
 /**
 * Validates that required environment variables are present.
 * Call this at server startup or in API routes.
 */
 const REQUIRED_SERVER_VARS = [
  "NEXT_PUBLIC_SUPABASE_URL",
  "NEXT_PUBLIC_SUPABASE_ANON_KEY",
 ];
 const REQUIRED_FOR_ADMIN = [
  "SUPABASE_SERVICE_ROLE_KEY",
 ];
 const OPTIONAL_VARS = [
  "CRON_SECRET",
  "RESEND_API_KEY",
  "LIGHTHOUSE_SERVICE_URL",
 ];
 /**
 * Check process.env against the required and optional variable lists.
 *
 * A variable counts as missing when it is unset or empty. A missing CRON_SECRET
 * produces exactly one warning, worded to match what verifyCronSecret does:
 * cron requests are allowed unauthenticated when NODE_ENV is "development"
 * and answered with an error otherwise.
 *
 * @returns `valid` is true when no required variable is missing; `missing` lists
 *          the absent required variables; `warnings` holds one message per
 *          absent optional variable
 */
 export function validateEnv(): { valid: boolean; missing: string[]; warnings: string[] } {
  const missing = [...REQUIRED_SERVER_VARS, ...REQUIRED_FOR_ADMIN].filter(
    (name) => !process.env[name]
  );

  const warnings: string[] = [];
  for (const name of OPTIONAL_VARS) {
    if (process.env[name]) continue;
    if (name === "CRON_SECRET") {
      // Without a secret the endpoints are not "unprotected" outside development:
      // they refuse every call, so say that instead of emitting two conflicting warnings.
      warnings.push(
        process.env.NODE_ENV === "development"
          ? "CRON_SECRET not set — cron endpoints accept unauthenticated requests in development"
          : "CRON_SECRET not set — cron endpoints will reject every request until it is configured"
      );
      continue;
    }
    warnings.push(`${name} not set — related features will be disabled`);
  }

  return { valid: missing.length === 0, missing, warnings };
 }
 // Run the validation once when this module is loaded, but only where `window`
 // is undefined (i.e. not in a browser bundle). Problems are logged, never thrown.
 if (typeof window === "undefined") {
  const report = validateEnv();
  if (!report.valid) {
    console.error(`[ENV] Missing required environment variables: ${report.missing.join(", ")}`);
  }
  report.warnings.forEach((warning) => {
    console.warn(`[ENV] ${warning}`);
  });
 }
@@ -219,6 +219,7 @@ export class NewCrawlerService {
    try {
      const urlObj = new URL(url);
      const contentHash = await this.computeContentHash(html);
      const templateHash = await this.computeTemplateHash(html);
      // Check if page already exists
      const { data: existingPage } = await getSupabaseAdmin()
@@ -236,6 +237,7 @@ export class NewCrawlerService {
            title,
            description,
            content_hash: contentHash,
            template_hash: templateHash,
            last_crawled_at: new Date().toISOString(),
            metadata: { 
              crawl_session_id: this.sessionId,
@@ -254,6 +256,7 @@ export class NewCrawlerService {
            title,
            description,
            content_hash: contentHash,
            template_hash: templateHash,
            content_type: "text/html",
            status_code: 200,
            depth: this.currentDepth,
@@ -272,6 +275,51 @@ export class NewCrawlerService {
    }
  }
  /**
   * Compute a template hash from HTML: the page is parsed with jsdom, reduced to
   * a structural skeleton by extractDomSkeleton, and that skeleton is hashed
   * with computeContentHash. Pages with the same layout (e.g., product pages)
   * will share the same template_hash.
   *
   * If loading jsdom, parsing, or skeleton extraction throws, the raw HTML is
   * hashed instead so a value is still produced.
   *
   * @param html raw HTML of the crawled page
   * @returns hash of the DOM skeleton, or of the raw HTML on fallback
   */
  private async computeTemplateHash(html: string): Promise<string> {
    try {
      const { JSDOM: JSDOMParser } = await import("jsdom");
      const dom = new JSDOMParser(html);
      let skeleton: string;
      try {
        skeleton = this.extractDomSkeleton(dom.window.document.body);
      } finally {
        // Close the window so each crawled page does not leave a live jsdom
        // window behind.
        dom.window.close();
      }
      return this.computeContentHash(skeleton);
    } catch {
      // Fallback: hash the raw HTML if JSDOM fails
      return this.computeContentHash(html);
    }
  }
  /**
   * Extract a structural skeleton of the DOM: one line per element holding its
   * tag name and class names, indented by one space per nesting level.
   * Text content, ids, data attributes, images, etc. are ignored.
   *
   * Class names are normalized (empty tokens dropped, duplicates removed, sorted)
   * so that `class=" b a "`, `class="a b"` and `class="a a b"` yield the same
   * line. Without this, stray whitespace in a class attribute would change the hash.
   *
   * Note: elements nested deeper than 20 levels are omitted.
   *
   * @param element root element to walk, or null
   * @returns the skeleton text, or "" when element is null
   */
  private extractDomSkeleton(element: Element | null): string {
    if (!element) return "";
    // Skip script, style, svg, noscript — they're not layout
    const nonLayoutTags = new Set(["script", "style", "svg", "noscript"]);
    const parts: string[] = [];
    const walk = (el: Element, depth: number) => {
      if (depth > 20) return; // bound the recursion depth
      const tag = el.tagName.toLowerCase();
      if (nonLayoutTags.has(tag)) return;
      // className is only used when it is a string; other values count as "no classes".
      const rawClassName = typeof el.className === "string" ? el.className : "";
      const classTokens = rawClassName.split(/\s+/).filter(Boolean);
      const classes = [...new Set(classTokens)].sort().join(".");
      parts.push(`${" ".repeat(depth)}<${tag}${classes ? "." + classes : ""}>`);
      for (const child of Array.from(el.children)) {
        walk(child, depth + 1);
      }
    };
    walk(element, 0);
    return parts.join("\n");
  }
  private extractLinks(document: Document, baseUrl: string): string[] {
    const links = Array.from(document.querySelectorAll("a[href]"))
      .map((link) => {
@@ -105,7 +105,8 @@ export class ScanScheduler {
  }
  /**
-   * Process a single scheduled scan
+   * Process a single scheduled scan — scans ALL unique-layout pages, not just the main page.
   * Groups pages by template_hash to avoid scanning duplicate layouts (e.g., product pages).
   */
  private async processScheduledScan(scheduledScan: ScheduledScan): Promise<void> {
    try {
@@ -123,35 +124,54 @@ export class ScanScheduler {
        return;
      }
-      // Get the main page for this website
+      // Get ALL active pages for this website, grouped by template_hash
-      const { data: page, error: pageError } = await this.supabase
+      const { data: pages, error: pageError } = await this.supabase
        .from('pages')
-        .select('id')
+        .select('id, url, path, template_hash, depth, content_hash')
        .eq('website_id', scheduledScan.websiteId)
        .eq('is_active', true)
-        .order('created_at', { ascending: false })
+        .order('depth', { ascending: true })
-        .limit(1)
+        .order('created_at', { ascending: true });
        .single();
-      if (pageError || !page) {
+      if (pageError || !pages || pages.length === 0) {
-        logError('No active page found for scheduled scan', pageError, {
+        logError('No active pages found for scheduled scan', pageError, {
          websiteId: scheduledScan.websiteId,
        });
        return;
      }
-      // Perform scans for each device type
+      // Deduplicate pages by template_hash — scan only one page per unique layout
-      for (const deviceType of scheduledScan.deviceTypes) {
+      const uniquePages = this.deduplicateByLayout(pages);
        const scanConfig: ScanConfig = {
          websiteId: scheduledScan.websiteId,
          pageId: page.id as string,
          deviceType,
          categories: scheduledScan.categories,
          priority: 'medium',
          triggeredBy: 'scheduled',
        };
-        await lighthouseScanner.performScan(scanConfig);
+      console.info(JSON.stringify({
        level: 'info',
        event: 'scan_pages_selected',
        websiteId: scheduledScan.websiteId,
        totalPages: pages.length,
        uniqueLayouts: uniquePages.length,
        timestamp: new Date().toISOString(),
      }));
      // Scan each unique page
      for (const page of uniquePages) {
        // Re-check limits before each scan
        const { canScan: stillCanScan } = await lighthouseScanner.checkSubscriptionLimits(
          scheduledScan.organizationId
        );
        if (!stillCanScan) break;
        for (const deviceType of scheduledScan.deviceTypes) {
          const scanConfig: ScanConfig = {
            websiteId: scheduledScan.websiteId,
            pageId: page.id as string,
            deviceType,
            categories: scheduledScan.categories,
            priority: (page.depth as number) === 0 ? 'high' : 'medium',
            triggeredBy: 'scheduled',
          };
          await lighthouseScanner.performScan(scanConfig);
        }
      }
      // Update the last run time
@@ -161,6 +181,45 @@ export class ScanScheduler {
    }
  }
  /**
   * Deduplicate pages by layout so each unique layout is scanned only once.
   *
   * Each page's layout key is its template_hash, falling back to content_hash
   * when template_hash is missing. The first page seen with a given key is kept
   * and later pages with the same key are dropped, so callers should pass pages
   * ordered shallowest-first.
   *
   * - Pages at depth 0 (or without a depth) are always kept.
   * - Pages with neither hash are always kept (treated as unique).
   * - The key of every kept page, including the main page's content_hash
   *   fallback, is recorded, so a sub-page identical to the main page is not
   *   scanned a second time.
   *
   * @param pages page rows carrying `depth`, `template_hash` and `content_hash`
   * @returns the pages to scan, in their original order
   */
  private deduplicateByLayout(pages: Record<string, unknown>[]): Record<string, unknown>[] {
    const seenHashes = new Set<string>();
    const result: Record<string, unknown>[] = [];
    for (const page of pages) {
      const isMainPage = Number(page.depth ?? 0) === 0;
      const hash =
        (page.template_hash as string | null) || (page.content_hash as string | null) || null;

      if (isMainPage || !hash || !seenHashes.has(hash)) {
        result.push(page);
      }
      if (hash) seenHashes.add(hash);
    }
    return result;
  }
  /**
   * Check for website changes and trigger scans if needed
   */
@@ -213,24 +272,23 @@ export class ScanScheduler {
        return;
      }
-      // Get the main page
+      // Get all unique-layout pages (not just main)
-      const { data: page, error: pageError } = await this.supabase
+      const { data: pages, error: pageError } = await this.supabase
        .from('pages')
-        .select('id')
+        .select('id, url, path, template_hash, depth, content_hash')
        .eq('website_id', website.id)
        .eq('is_active', true)
-        .order('created_at', { ascending: false })
+        .order('depth', { ascending: true });
        .limit(1)
        .single();
-      if (pageError || !page) {
+      if (pageError || !pages || pages.length === 0) {
        return;
      }
-      // Trigger a high-priority scan due to changes
+      // Scan main page with high priority on change
      const mainPage = pages[0];
      const scanConfig: ScanConfig = {
        websiteId: website.id,
-        pageId: page.id as string,
+        pageId: mainPage.id as string,
        deviceType: 'desktop', // Start with desktop for change detection
        categories: ['performance', 'accessibility', 'seo', 'best_practices'],
        priority: 'high',