feat: production hardening + smart subpage scanning with layout dedup

Security:
- Add CRON_SECRET auth to /api/cron/* endpoints
- Add admin role verification to /api/admin/* routes
- Add org membership check to /api/billing/usage
- Add security headers (HSTS, X-Frame-Options, X-Content-Type-Options, Referrer-Policy, Permissions-Policy, etc.; no CSP yet)
- Add env variable validation at startup
- Add rate limiting to backend API (30 req/min per IP)

Infrastructure:
- Multi-stage Dockerfiles with non-root user + healthchecks
- Updated cron workflow to pass CRON_SECRET header
- Updated .env.example with all optional vars

Smart subpage scanning:
- Crawler now computes template_hash (DOM structure without content)
- Scanner scans ALL unique-layout pages, not just main page
- Pages with same layout (e.g. product pages) scanned only once
- Deduplication by template_hash, fallback to content_hash
- Main page always scanned with high priority
- Re-checks subscription limits before each page scan

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Dennis
2026-03-06 07:44:32 +01:00
parent d8de0a973a
commit 1c545c93b4
18 changed files with 498 additions and 59 deletions
+11
View File
@@ -18,3 +18,14 @@ CORS_ORIGIN=http://localhost:3000
NEXT_PUBLIC_SUPABASE_URL=https://your-project.supabase.co NEXT_PUBLIC_SUPABASE_URL=https://your-project.supabase.co
NEXT_PUBLIC_SUPABASE_ANON_KEY=your-anon-key NEXT_PUBLIC_SUPABASE_ANON_KEY=your-anon-key
SUPABASE_SERVICE_ROLE_KEY=your-service-role-key SUPABASE_SERVICE_ROLE_KEY=your-service-role-key
# ── Security ────────────────────────────────
# Required in production: protects /api/cron/* endpoints
CRON_SECRET=generate-a-random-secret-here
# ── Optional Services ───────────────────────
# Email notifications (Resend — free tier: 3000 emails/mo)
RESEND_API_KEY=re_your_resend_key
# Lighthouse backend URL (for automated scans)
LIGHTHOUSE_SERVICE_URL=http://localhost:5000
+27 -16
View File
@@ -1,25 +1,36 @@
# Use the official Node.js image. # --- Stage 1: Build ---
FROM node:18 FROM node:20-slim AS builder
# OPTIONAL: Falls in der Base kein Chrome enthalten ist,
# müsstest du hier noch "apt-get update" + "apt-get install chromium" oder ähnliches ausführen,
# z. B.:
RUN apt-get update && apt-get install -y chromium
# Create and change to the app directory.
WORKDIR /app WORKDIR /app
# Copy application dependency manifests to the container image.
COPY package*.json ./ COPY package*.json ./
RUN npm ci
# Install production dependencies.
RUN npm install
# Copy local code to the container image.
COPY . . COPY . .
# Build the TypeScript code
RUN npm run build RUN npm run build
# Run the web service on container startup. # --- Stage 2: Production ---
FROM node:20-slim AS runtime
RUN apt-get update && apt-get install -y --no-install-recommends chromium \
&& rm -rf /var/lib/apt/lists/*
ENV CHROME_BIN=/usr/bin/chromium
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
WORKDIR /app
RUN groupadd -r app && useradd -r -g app -d /app app
COPY --from=builder --chown=app:app /app/dist ./dist
COPY --from=builder --chown=app:app /app/node_modules ./node_modules
COPY --from=builder --chown=app:app /app/package.json ./
USER app
EXPOSE 5000
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD node -e "const h=require('http');h.get('http://localhost:5000/health',(r)=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))"
CMD ["node", "dist/index.js"] CMD ["node", "dist/index.js"]
+26
View File
@@ -5,10 +5,36 @@ import lighthouseRouter from "./routes/lighthouse.js";
dotenv.config(); dotenv.config();
// Rate limiting (simple in-memory fixed window; state is per process, so it is
// only accurate when a single instance serves all traffic)
const rateLimitMap = new Map<string, { count: number; resetAt: number }>();
const RATE_LIMIT_WINDOW = 60_000; // 1 minute
const RATE_LIMIT_MAX = 30; // requests per window
// Time of the last sweep of expired entries (see sweepExpiredRateLimits)
let rateLimitLastSweep = Date.now();

/**
 * Remove entries whose window has already ended.
 *
 * Without this, every distinct client key stays in the map forever and the
 * process slowly leaks memory. The sweep runs at most once per window so the
 * per-request cost stays negligible.
 */
function sweepExpiredRateLimits(now: number): void {
  if (now - rateLimitLastSweep < RATE_LIMIT_WINDOW) return;
  rateLimitLastSweep = now;
  for (const [key, entry] of rateLimitMap) {
    if (now > entry.resetAt) rateLimitMap.delete(key);
  }
}

/**
 * Express middleware: allow at most RATE_LIMIT_MAX requests per
 * RATE_LIMIT_WINDOW for each client key, otherwise answer 429.
 *
 * The client key is `req.ip`, falling back to the first hop of the
 * `x-forwarded-for` header, and finally to the shared key "unknown".
 *
 * @param req  incoming request
 * @param res  response used to send the 429 (with a `Retry-After` header)
 * @param next continuation invoked when the request is allowed
 */
function rateLimit(req: Request, res: Response, next: () => void) {
  const forwarded = req.headers["x-forwarded-for"];
  // The header may be an array or a comma-separated proxy chain; use only the
  // first hop so one client does not get a new key per proxy path.
  const forwardedFirst = Array.isArray(forwarded) ? forwarded[0] : forwarded;
  const ip = req.ip || String(forwardedFirst || "").split(",")[0].trim() || "unknown";
  const key = String(ip);
  const now = Date.now();

  sweepExpiredRateLimits(now);

  const entry = rateLimitMap.get(key);

  // First request from this key, or its previous window is over: start fresh.
  if (!entry || now > entry.resetAt) {
    rateLimitMap.set(key, { count: 1, resetAt: now + RATE_LIMIT_WINDOW });
    return next();
  }

  if (entry.count >= RATE_LIMIT_MAX) {
    const retryAfterSeconds = Math.max(1, Math.ceil((entry.resetAt - now) / 1000));
    res
      .status(429)
      .set("Retry-After", String(retryAfterSeconds))
      .json({ error: "Too many requests" });
    return;
  }

  entry.count++;
  next();
}
const app = express(); const app = express();
app.use(cors({ origin: process.env.CORS_ORIGIN || "*" })); app.use(cors({ origin: process.env.CORS_ORIGIN || "*" }));
app.use(express.json()); app.use(express.json());
app.use(rateLimit);
app.get("/health", (_req: Request, res: Response) => { app.get("/health", (_req: Request, res: Response) => {
res.status(200).json({ status: "ok", timestamp: new Date().toISOString() }); res.status(200).json({ status: "ok", timestamp: new Date().toISOString() });
@@ -26,7 +26,7 @@ jobs:
DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}" DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}"
echo "Running uptime checks at: $DEPLOYMENT_URL/api/cron/uptime" echo "Running uptime checks at: $DEPLOYMENT_URL/api/cron/uptime"
response=$(curl -s -w "\n%{http_code}" "$DEPLOYMENT_URL/api/cron/uptime") response=$(curl -s -w "\n%{http_code}" -H "Authorization: Bearer $CRON_SECRET" "$DEPLOYMENT_URL/api/cron/uptime")
http_code=$(echo "$response" | tail -n1) http_code=$(echo "$response" | tail -n1)
response_body=$(echo "$response" | head -n -1) response_body=$(echo "$response" | head -n -1)
@@ -41,6 +41,7 @@ jobs:
fi fi
env: env:
DEPLOYMENT_URL: ${{ secrets.DEPLOYMENT_URL }} DEPLOYMENT_URL: ${{ secrets.DEPLOYMENT_URL }}
CRON_SECRET: ${{ secrets.CRON_SECRET }} CRON_SECRET: ${{ secrets.CRON_SECRET }}
scan: scan:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@@ -51,7 +52,7 @@ jobs:
DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}" DEPLOYMENT_URL="${DEPLOYMENT_URL:-https://your-domain.com}"
echo "Triggering scan at: $DEPLOYMENT_URL/api/cron/scan?mode=all" echo "Triggering scan at: $DEPLOYMENT_URL/api/cron/scan?mode=all"
response=$(curl -s -w "\n%{http_code}" -X POST "$DEPLOYMENT_URL/api/cron/scan?mode=all") response=$(curl -s -w "\n%{http_code}" -X POST -H "Authorization: Bearer $CRON_SECRET" "$DEPLOYMENT_URL/api/cron/scan?mode=all")
http_code=$(echo "$response" | tail -n1) http_code=$(echo "$response" | tail -n1)
response_body=$(echo "$response" | head -n -1) response_body=$(echo "$response" | head -n -1)
+32 -10
View File
@@ -1,16 +1,38 @@
FROM node:18 # --- Stage 1: Dependencies ---
FROM node:20-slim AS deps
WORKDIR /app WORKDIR /app
COPY package.json package-lock.json ./ COPY package.json package-lock.json ./
RUN npm ci
# Disable the oxide engine so it falls back to JS # --- Stage 2: Build ---
ENV TAILWIND_DISABLE_OXIDE=1 FROM node:20-slim AS builder
WORKDIR /app
RUN npm install COPY --from=deps /app/node_modules ./node_modules
COPY . . COPY . .
ENV NEXT_TELEMETRY_DISABLED=1
ENV TAILWIND_DISABLE_OXIDE=1
RUN npm run build RUN npm run build
CMD ["npm", "run", "start"] # --- Stage 3: Production ---
FROM node:20-slim AS runtime
WORKDIR /app
RUN groupadd -r app && useradd -r -g app -d /app app
COPY --from=builder --chown=app:app /app/.next/standalone ./
COPY --from=builder --chown=app:app /app/.next/static ./.next/static
COPY --from=builder --chown=app:app /app/public ./public
USER app
EXPOSE 3000
ENV PORT=3000
ENV HOSTNAME="0.0.0.0"
ENV NODE_ENV=production
ENV NEXT_TELEMETRY_DISABLED=1
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
CMD node -e "const h=require('http');h.get('http://localhost:3000/api/health',(r)=>{process.exit(r.statusCode===200?0:1)}).on('error',()=>process.exit(1))"
CMD ["node", "server.js"]
@@ -203,4 +203,8 @@ CREATE TABLE IF NOT EXISTS alert_configurations (
created_at timestamp with time zone DEFAULT now(), created_at timestamp with time zone DEFAULT now(),
updated_at timestamp with time zone DEFAULT now() updated_at timestamp with time zone DEFAULT now()
); );
-- Add template_hash to pages table for layout deduplication
ALTER TABLE pages ADD COLUMN IF NOT EXISTS template_hash VARCHAR;
CREATE INDEX IF NOT EXISTS idx_pages_template_hash ON pages(template_hash) WHERE template_hash IS NOT NULL;
); );
@@ -1,10 +1,27 @@
import type { NextConfig } from "next"; import type { NextConfig } from "next";
// Response headers applied to every route, written as a name -> value table
// and expanded into the `{ key, value }` entries the Next.js config expects.
const securityHeaders = Object.entries({
  "X-DNS-Prefetch-Control": "on",
  "Strict-Transport-Security": "max-age=63072000; includeSubDomains; preload",
  "X-Frame-Options": "SAMEORIGIN",
  "X-Content-Type-Options": "nosniff",
  "Referrer-Policy": "origin-when-cross-origin",
  "Permissions-Policy": "camera=(), microphone=(), geolocation=()",
}).map(([key, value]) => ({ key, value }));
const nextConfig: NextConfig = { const nextConfig: NextConfig = {
eslint: { eslint: {
// Do not fail production builds due to ESLint errors // Do not fail production builds due to ESLint errors
ignoreDuringBuilds: true, ignoreDuringBuilds: true,
}, },
async headers() {
return [
{
source: "/(.*)",
headers: securityHeaders,
},
];
},
}; };
export default nextConfig; export default nextConfig;
@@ -207,6 +207,7 @@ CREATE TABLE IF NOT EXISTS pages (
title VARCHAR, title VARCHAR,
description TEXT, description TEXT,
content_hash VARCHAR, content_hash VARCHAR,
template_hash VARCHAR,
content_type VARCHAR, content_type VARCHAR,
status_code INTEGER, status_code INTEGER,
is_active BOOLEAN DEFAULT true, is_active BOOLEAN DEFAULT true,
@@ -1,12 +1,17 @@
import { NextResponse } from "next/server"; import { NextResponse } from "next/server";
import { getSupabaseAdmin } from "@/lib/admin"; import { getSupabaseAdmin } from "@/lib/admin";
import { requireAdmin } from "@/lib/apiAuth";
/** /**
* GET /api/admin/organizations * GET /api/admin/organizations
* *
* List all organizations with usage stats. * List all organizations with usage stats.
* Requires admin or owner role.
*/ */
export async function GET(request: Request) { export async function GET(request: Request) {
const auth = await requireAdmin(request);
if (auth instanceof NextResponse) return auth;
try { try {
const supabase = getSupabaseAdmin(); const supabase = getSupabaseAdmin();
const url = new URL(request.url); const url = new URL(request.url);
@@ -68,6 +73,9 @@ export async function GET(request: Request) {
* Update organization: change tier, deactivate, etc. * Update organization: change tier, deactivate, etc.
*/ */
export async function PATCH(request: Request) { export async function PATCH(request: Request) {
const auth = await requireAdmin(request);
if (auth instanceof NextResponse) return auth;
try { try {
const supabase = getSupabaseAdmin(); const supabase = getSupabaseAdmin();
const { organizationId, updates } = await request.json(); const { organizationId, updates } = await request.json();
@@ -1,12 +1,17 @@
import { NextResponse } from "next/server"; import { NextResponse } from "next/server";
import { getSupabaseAdmin } from "@/lib/admin"; import { getSupabaseAdmin } from "@/lib/admin";
import { requireAdmin } from "@/lib/apiAuth";
/** /**
* GET /api/admin/stats * GET /api/admin/stats
* *
* Returns system-wide statistics for the admin dashboard. * Returns system-wide statistics for the admin dashboard.
* Requires admin or owner role.
*/ */
export async function GET() { export async function GET(request: Request) {
const auth = await requireAdmin(request);
if (auth instanceof NextResponse) return auth;
try { try {
const supabase = getSupabaseAdmin(); const supabase = getSupabaseAdmin();
@@ -1,13 +1,18 @@
import { NextResponse } from "next/server"; import { NextResponse } from "next/server";
import { getSupabaseAdmin } from "@/lib/admin"; import { getSupabaseAdmin } from "@/lib/admin";
import { requireAdmin } from "@/lib/apiAuth";
/** /**
* GET /api/admin/users * GET /api/admin/users
* *
* List all users with their organization memberships and usage stats. * List all users with their organization memberships and usage stats.
* Query params: ?page=1&limit=20&search=keyword * Query params: ?page=1&limit=20&search=keyword
* Requires admin or owner role.
*/ */
export async function GET(request: Request) { export async function GET(request: Request) {
const auth = await requireAdmin(request);
if (auth instanceof NextResponse) return auth;
try { try {
const supabase = getSupabaseAdmin(); const supabase = getSupabaseAdmin();
const url = new URL(request.url); const url = new URL(request.url);
@@ -79,6 +84,9 @@ export async function GET(request: Request) {
* Body: { userId, action, value } * Body: { userId, action, value }
*/ */
export async function PATCH(request: Request) { export async function PATCH(request: Request) {
const auth = await requireAdmin(request);
if (auth instanceof NextResponse) return auth;
try { try {
const supabase = getSupabaseAdmin(); const supabase = getSupabaseAdmin();
const { userId, action, value } = await request.json(); const { userId, action, value } = await request.json();
@@ -152,6 +160,9 @@ export async function PATCH(request: Request) {
* Body: { userId } * Body: { userId }
*/ */
export async function DELETE(request: Request) { export async function DELETE(request: Request) {
const auth = await requireAdmin(request);
if (auth instanceof NextResponse) return auth;
try { try {
const supabase = getSupabaseAdmin(); const supabase = getSupabaseAdmin();
const { userId } = await request.json(); const { userId } = await request.json();
@@ -1,16 +1,17 @@
import { NextResponse } from "next/server"; import { NextResponse } from "next/server";
import { getSupabaseAdmin } from "@/lib/admin"; import { getSupabaseAdmin } from "@/lib/admin";
import { TIER_LIMITS } from "@/services/tierLimits"; import { TIER_LIMITS } from "@/services/tierLimits";
import { requireOrgMembership } from "@/lib/apiAuth";
/** /**
* GET /api/billing/usage * GET /api/billing/usage
* *
* Returns current usage vs tier limits for an organization. * Returns current usage vs tier limits for an organization.
* Requires authenticated user who is a member of the organization.
* Query params: ?organizationId=xxx * Query params: ?organizationId=xxx
*/ */
export async function GET(request: Request) { export async function GET(request: Request) {
try { try {
const supabase = getSupabaseAdmin();
const url = new URL(request.url); const url = new URL(request.url);
const organizationId = url.searchParams.get("organizationId"); const organizationId = url.searchParams.get("organizationId");
@@ -18,6 +19,12 @@ export async function GET(request: Request) {
return NextResponse.json({ error: "organizationId required" }, { status: 400 }); return NextResponse.json({ error: "organizationId required" }, { status: 400 });
} }
// Verify caller belongs to this organization
const auth = await requireOrgMembership(organizationId, request);
if (auth instanceof NextResponse) return auth;
const supabase = getSupabaseAdmin();
// Get organization with tier info // Get organization with tier info
const { data: org, error: orgError } = await supabase const { data: org, error: orgError } = await supabase
.from("organizations") .from("organizations")
@@ -2,8 +2,12 @@ import { NextResponse } from "next/server";
import { scanScheduler } from "@/services/scanScheduler"; import { scanScheduler } from "@/services/scanScheduler";
import { lighthouseScanner } from "@/services/lighthouseScanner"; import { lighthouseScanner } from "@/services/lighthouseScanner";
import { logError } from "@/utils/errorUtils"; import { logError } from "@/utils/errorUtils";
import { verifyCronSecret } from "@/lib/apiAuth";
export async function GET(request: Request) { export async function GET(request: Request) {
const authError = verifyCronSecret(request);
if (authError) return authError;
try { try {
const url = new URL(request.url); const url = new URL(request.url);
const mode = url.searchParams.get("mode") || "all"; // "scheduled", "change_detection", "all" const mode = url.searchParams.get("mode") || "all"; // "scheduled", "change_detection", "all"
@@ -1,16 +1,21 @@
import { NextResponse } from "next/server"; import { NextResponse } from "next/server";
import { performUptimeChecks, evaluateUptimeAlerts } from "@/services/uptimeService"; import { performUptimeChecks, evaluateUptimeAlerts } from "@/services/uptimeService";
import { verifyCronSecret } from "@/lib/apiAuth";
/** /**
* GET /api/cron/uptime * GET /api/cron/uptime
* *
* Performs uptime checks on all active websites and evaluates alert rules. * Performs uptime checks on all active websites and evaluates alert rules.
* Designed to be called by a cron job (e.g., GitHub Actions, Vercel Cron, or external scheduler). * Designed to be called by a cron job (e.g., GitHub Actions, Vercel Cron, or external scheduler).
* Requires CRON_SECRET authorization in production.
* *
* Query params: * Query params:
* - alerts=true (default) — also evaluate alert rules after checks * - alerts=true (default) — also evaluate alert rules after checks
*/ */
export async function GET(request: Request) { export async function GET(request: Request) {
const authError = verifyCronSecret(request);
if (authError) return authError;
const startTime = Date.now(); const startTime = Date.now();
try { try {
@@ -0,0 +1,148 @@
import { createClient } from "@supabase/supabase-js";
import { NextResponse } from "next/server";
import { getSupabaseAdmin } from "./admin";
/**
 * Compare two strings without returning early on the first differing
 * character, so the time taken does not reveal how much of a guessed secret
 * was correct. Strings of different length always compare unequal.
 */
function constantTimeEquals(a: string, b: string): boolean {
  let diff = a.length ^ b.length;
  const length = Math.max(a.length, b.length);
  for (let i = 0; i < length; i++) {
    // charCodeAt past the end yields NaN, which `|| 0` maps to 0.
    diff |= (a.charCodeAt(i) || 0) ^ (b.charCodeAt(i) || 0);
  }
  return diff === 0;
}

/**
 * Verify CRON_SECRET for cron endpoints.
 *
 * The caller must send `Authorization: Bearer <CRON_SECRET>`.
 *
 * @param request incoming request whose `authorization` header is checked
 * @returns `null` when the request is authorized; otherwise a JSON error
 *   response: 500 when CRON_SECRET is not configured (outside development),
 *   401 when the header is missing or does not match.
 */
export function verifyCronSecret(request: Request): NextResponse | null {
  const authHeader = request.headers.get("authorization");
  const cronSecret = process.env.CRON_SECRET;

  if (!cronSecret) {
    // If no secret configured, allow in development only
    if (process.env.NODE_ENV === "development") return null;
    return NextResponse.json(
      { error: "CRON_SECRET not configured" },
      { status: 500 }
    );
  }

  // Constant-time comparison: a plain `!==` stops at the first mismatch and
  // can leak the secret prefix through response timing.
  if (authHeader === null || !constantTimeEquals(authHeader, `Bearer ${cronSecret}`)) {
    return NextResponse.json(
      { error: "Unauthorized" },
      { status: 401 }
    );
  }

  return null;
}
/** Identity and authorization context resolved for an authenticated caller. */
interface AuthResult {
  /** Id of the user returned by Supabase for the presented access token. */
  userId: string;
  /** Role string resolved during authentication, or null when none was found. */
  role: string | null;
  /** `organization_id` from the caller's `users` row, or null when absent. */
  organizationId: string | null;
}
/**
 * Pull a Supabase access token out of a request.
 *
 * Order of precedence:
 *  1. `Authorization: Bearer <token>` header (ignored when the token is empty)
 *  2. a `sb-<ref>-auth-token` cookie, either JSON (an array whose first item
 *     is the token, an object with `access_token`, or a JSON string) or the
 *     raw token itself.
 *
 * NOTE(review): cookies split into `.0`/`.1` chunks or using another encoding
 * are not matched by this pattern; confirm which cookie format the frontend's
 * Supabase client actually writes.
 */
function extractAccessToken(request: Request): string | null {
  const authHeader = request.headers.get("authorization");
  if (authHeader?.startsWith("Bearer ")) {
    const bearer = authHeader.slice(7);
    if (bearer) return bearer;
  }

  const cookieHeader = request.headers.get("cookie") || "";
  const match = cookieHeader.match(/sb-[^=]+-auth-token=([^;]+)/);
  if (!match) return null;

  const raw = match[1] ?? null;
  if (!raw) return null;

  try {
    const parsed: unknown = JSON.parse(decodeURIComponent(raw));
    let candidate: unknown = null;
    if (typeof parsed === "string") {
      candidate = parsed;
    } else if (Array.isArray(parsed)) {
      candidate = parsed[0];
    } else if (parsed && typeof parsed === "object") {
      candidate = (parsed as { access_token?: unknown }).access_token;
    }
    // Only a non-empty string can be a token; anything else is "no token".
    return typeof candidate === "string" && candidate ? candidate : null;
  } catch {
    // Cookie might be the token directly
    return raw;
  }
}

/**
 * Authenticate the current user from the request cookies or Authorization header.
 *
 * The token is verified with Supabase, then the caller's `role` and
 * `organization_id` are read from the `users` table.
 *
 * Security: the role fallback uses `app_metadata`, never `user_metadata`.
 * `user_metadata` can be changed by the signed-in user, so trusting a role
 * stored there would let any user promote themselves to admin.
 *
 * @param request incoming request; when omitted the call fails with 401
 * @returns the resolved {@link AuthResult}, or a 401 JSON response when no
 *   valid token is present or any step of the lookup throws
 */
export async function authenticateUser(request?: Request): Promise<AuthResult | NextResponse> {
  try {
    const accessToken = request ? extractAccessToken(request) : null;

    if (!accessToken) {
      return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
    }

    // Verify the token using Supabase
    const supabase = createClient(
      process.env.NEXT_PUBLIC_SUPABASE_URL!,
      process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY!
    );

    const { data: { user }, error } = await supabase.auth.getUser(accessToken);
    if (error || !user) {
      return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
    }

    // Get user details (role, org)
    const admin = getSupabaseAdmin();
    const { data: userData } = await admin
      .from("users")
      .select("organization_id, role")
      .eq("id", user.id)
      .single();

    // Server-controlled fallback only; see the security note above.
    const appRole: unknown = user.app_metadata?.role;
    const fallbackRole = typeof appRole === "string" && appRole ? appRole : null;

    return {
      userId: user.id,
      role: (userData?.role as string) || fallbackRole,
      organizationId: (userData?.organization_id as string) || null,
    };
  } catch (err: unknown) {
    // Still answer 401, but leave a trace so misconfiguration is diagnosable.
    console.error("[AUTH] Authentication failed:", err instanceof Error ? err.message : err);
    return NextResponse.json({ error: "Authentication failed" }, { status: 401 });
  }
}
/**
 * Gate for admin-only handlers.
 *
 * Authenticates the caller and lets them through only when their resolved
 * role is "admin" or "owner".
 *
 * @returns the caller's AuthResult on success; the authentication error
 *   response unchanged if authentication failed; a 403 JSON response when
 *   the caller is authenticated but has neither role.
 */
export async function requireAdmin(request?: Request): Promise<AuthResult | NextResponse> {
  const caller = await authenticateUser(request);
  if (caller instanceof NextResponse) {
    return caller;
  }

  const isPrivileged = caller.role === "admin" || caller.role === "owner";
  return isPrivileged
    ? caller
    : NextResponse.json({ error: "Forbidden: admin access required" }, { status: 403 });
}
/**
 * Gate for handlers scoped to one organization.
 *
 * Callers whose role is "admin" or "owner" pass without a membership lookup.
 * Everyone else must have a row in `organization_members` for the given
 * organization.
 *
 * NOTE(review): if "owner" means owner of one's own organization rather than
 * a platform-wide role, the bypass lets an owner read other organizations;
 * confirm the role model.
 *
 * @param organizationId organization the caller wants to access
 * @param request incoming request used for authentication
 * @returns the caller's AuthResult, the authentication error response, or a
 *   403 JSON response when no membership row is found
 */
export async function requireOrgMembership(
  organizationId: string,
  request?: Request
): Promise<AuthResult | NextResponse> {
  const caller = await authenticateUser(request);
  if (caller instanceof NextResponse) return caller;

  const bypassesMembership = caller.role === "admin" || caller.role === "owner";
  if (!bypassesMembership) {
    const { data: membership } = await getSupabaseAdmin()
      .from("organization_members")
      .select("id")
      .eq("user_id", caller.userId)
      .eq("organization_id", organizationId)
      .single();

    if (!membership) {
      return NextResponse.json(
        { error: "Forbidden: not a member of this organization" },
        { status: 403 }
      );
    }
  }

  return caller;
}
@@ -0,0 +1,52 @@
/**
 * Validates that required environment variables are present.
 * Call this at server startup or in API routes.
 */

/** Variables every server process needs. */
const REQUIRED_SERVER_VARS = [
  "NEXT_PUBLIC_SUPABASE_URL",
  "NEXT_PUBLIC_SUPABASE_ANON_KEY",
];

/** Variables needed by code paths that use the service-role client. */
const REQUIRED_FOR_ADMIN = [
  "SUPABASE_SERVICE_ROLE_KEY",
];

/** Variables whose absence only produces a warning. */
const OPTIONAL_VARS = [
  "CRON_SECRET",
  "RESEND_API_KEY",
  "LIGHTHOUSE_SERVICE_URL",
];

/**
 * Check `process.env` against the lists above.
 *
 * @returns `missing`: required variables that are unset or empty (in list
 *   order); `warnings`: one message per unset optional variable; `valid`:
 *   true when nothing required is missing.
 *
 * In production a missing CRON_SECRET yields exactly one, specific warning
 * instead of the generic one plus a second message. The cron endpoints
 * respond with an error when the secret is unset outside development, so the
 * warning says so rather than calling them unprotected.
 */
export function validateEnv(): { valid: boolean; missing: string[]; warnings: string[] } {
  const isSet = (name: string): boolean => Boolean(process.env[name]);

  const missing = [...REQUIRED_SERVER_VARS, ...REQUIRED_FOR_ADMIN].filter(
    (name) => !isSet(name)
  );

  const cronSecretMissingInProd =
    !isSet("CRON_SECRET") && process.env.NODE_ENV === "production";

  const warnings = OPTIONAL_VARS
    .filter((name) => !isSet(name))
    // The production-specific message below replaces the generic one.
    .filter((name) => !(name === "CRON_SECRET" && cronSecretMissingInProd))
    .map((name) => `${name} not set — related features will be disabled`);

  if (cronSecretMissingInProd) {
    warnings.push(
      "CRON_SECRET not set — /api/cron/* endpoints will reject every request in production!"
    );
  }

  return { valid: missing.length === 0, missing, warnings };
}
// Import-time check: runs once when this module is first loaded, and only
// where no `window` global exists (i.e. not in the browser bundle).
if (typeof window === "undefined") {
  const report = validateEnv();

  if (!report.valid) {
    console.error(`[ENV] Missing required environment variables: ${report.missing.join(", ")}`);
  }

  report.warnings.forEach((warning) => {
    console.warn(`[ENV] ${warning}`);
  });
}
@@ -219,6 +219,7 @@ export class NewCrawlerService {
try { try {
const urlObj = new URL(url); const urlObj = new URL(url);
const contentHash = await this.computeContentHash(html); const contentHash = await this.computeContentHash(html);
const templateHash = await this.computeTemplateHash(html);
// Check if page already exists // Check if page already exists
const { data: existingPage } = await getSupabaseAdmin() const { data: existingPage } = await getSupabaseAdmin()
@@ -236,6 +237,7 @@ export class NewCrawlerService {
title, title,
description, description,
content_hash: contentHash, content_hash: contentHash,
template_hash: templateHash,
last_crawled_at: new Date().toISOString(), last_crawled_at: new Date().toISOString(),
metadata: { metadata: {
crawl_session_id: this.sessionId, crawl_session_id: this.sessionId,
@@ -254,6 +256,7 @@ export class NewCrawlerService {
title, title,
description, description,
content_hash: contentHash, content_hash: contentHash,
template_hash: templateHash,
content_type: "text/html", content_type: "text/html",
status_code: 200, status_code: 200,
depth: this.currentDepth, depth: this.currentDepth,
@@ -272,6 +275,51 @@ export class NewCrawlerService {
} }
} }
/**
 * Compute a template hash from HTML: strips text content and dynamic attributes,
 * keeping only the DOM structure (tag names, class names, hierarchy).
 * Pages with the same layout (e.g., product pages) will share the same template_hash.
 *
 * @param html raw HTML of the crawled page
 * @returns hash of the structural skeleton; if parsing or skeleton extraction
 *   throws, the hash of the raw HTML is returned instead
 */
private async computeTemplateHash(html: string): Promise<string> {
  try {
    const { JSDOM: JSDOMParser } = await import("jsdom");
    const dom = new JSDOMParser(html);
    try {
      const skeleton = this.extractDomSkeleton(dom.window.document.body);
      return this.computeContentHash(skeleton);
    } finally {
      // Release the jsdom window once the skeleton string has been built;
      // a crawl creates one window per page and would otherwise keep them
      // alive until garbage collection.
      dom.window.close();
    }
  } catch {
    // Fallback: hash the raw HTML if JSDOM fails
    return this.computeContentHash(html);
  }
}
/**
 * Extract a structural skeleton of the DOM: tag names + class names only.
 * Text content, ids and other attributes are not part of the output.
 *
 * Output format: one line per element, indented by one space per nesting
 * level, e.g. `<div.card.product>`. Class names are de-duplicated and sorted
 * so that attribute order and stray whitespace do not change the result.
 *
 * @param element root element to walk (normally `document.body`)
 * @returns the skeleton text, or "" when `element` is null
 */
private extractDomSkeleton(element: Element | null): string {
  if (!element) return "";

  // Not layout: skipped together with everything below them.
  const ignoredTags = new Set(["script", "style", "svg", "noscript"]);
  const maxDepth = 20;
  const parts: string[] = [];

  const walk = (el: Element, depth: number): void => {
    // Cap the depth: elements nested deeper than this are left out, which
    // bounds both recursion depth and skeleton size on pathological pages.
    if (depth > maxDepth) return;

    const tag = el.tagName.toLowerCase();
    if (ignoredTags.has(tag)) return;

    // `className` is not a string for SVG elements, hence the typeof check.
    // filter(Boolean) drops the empty tokens that split() yields for leading
    // or trailing whitespace (class=" a b "), which previously produced a
    // different skeleton for an identical layout.
    const classes =
      typeof el.className === "string"
        ? [...new Set(el.className.split(/\s+/).filter(Boolean))].sort().join(".")
        : "";

    parts.push(`${" ".repeat(depth)}<${tag}${classes ? "." + classes : ""}>`);

    for (const child of Array.from(el.children)) {
      walk(child, depth + 1);
    }
  };

  walk(element, 0);
  return parts.join("\n");
}
private extractLinks(document: Document, baseUrl: string): string[] { private extractLinks(document: Document, baseUrl: string): string[] {
const links = Array.from(document.querySelectorAll("a[href]")) const links = Array.from(document.querySelectorAll("a[href]"))
.map((link) => { .map((link) => {
@@ -105,7 +105,8 @@ export class ScanScheduler {
} }
/** /**
* Process a single scheduled scan * Process a single scheduled scan — scans ALL unique-layout pages, not just the main page.
* Groups pages by template_hash to avoid scanning duplicate layouts (e.g., product pages).
*/ */
private async processScheduledScan(scheduledScan: ScheduledScan): Promise<void> { private async processScheduledScan(scheduledScan: ScheduledScan): Promise<void> {
try { try {
@@ -123,35 +124,54 @@ export class ScanScheduler {
return; return;
} }
// Get the main page for this website // Get ALL active pages for this website, grouped by template_hash
const { data: page, error: pageError } = await this.supabase const { data: pages, error: pageError } = await this.supabase
.from('pages') .from('pages')
.select('id') .select('id, url, path, template_hash, depth, content_hash')
.eq('website_id', scheduledScan.websiteId) .eq('website_id', scheduledScan.websiteId)
.eq('is_active', true) .eq('is_active', true)
.order('created_at', { ascending: false }) .order('depth', { ascending: true })
.limit(1) .order('created_at', { ascending: true });
.single();
if (pageError || !page) { if (pageError || !pages || pages.length === 0) {
logError('No active page found for scheduled scan', pageError, { logError('No active pages found for scheduled scan', pageError, {
websiteId: scheduledScan.websiteId, websiteId: scheduledScan.websiteId,
}); });
return; return;
} }
// Perform scans for each device type // Deduplicate pages by template_hash — scan only one page per unique layout
for (const deviceType of scheduledScan.deviceTypes) { const uniquePages = this.deduplicateByLayout(pages);
const scanConfig: ScanConfig = {
websiteId: scheduledScan.websiteId,
pageId: page.id as string,
deviceType,
categories: scheduledScan.categories,
priority: 'medium',
triggeredBy: 'scheduled',
};
await lighthouseScanner.performScan(scanConfig); console.info(JSON.stringify({
level: 'info',
event: 'scan_pages_selected',
websiteId: scheduledScan.websiteId,
totalPages: pages.length,
uniqueLayouts: uniquePages.length,
timestamp: new Date().toISOString(),
}));
// Scan each unique page
for (const page of uniquePages) {
// Re-check limits before each scan
const { canScan: stillCanScan } = await lighthouseScanner.checkSubscriptionLimits(
scheduledScan.organizationId
);
if (!stillCanScan) break;
for (const deviceType of scheduledScan.deviceTypes) {
const scanConfig: ScanConfig = {
websiteId: scheduledScan.websiteId,
pageId: page.id as string,
deviceType,
categories: scheduledScan.categories,
priority: (page.depth as number) === 0 ? 'high' : 'medium',
triggeredBy: 'scheduled',
};
await lighthouseScanner.performScan(scanConfig);
}
} }
// Update the last run time // Update the last run time
@@ -161,6 +181,45 @@ export class ScanScheduler {
} }
} }
/**
 * Deduplicate pages by layout hash.
 *
 * The layout hash of a page is its `template_hash`, or its `content_hash`
 * when no template hash is stored. For every hash only the first page in
 * `pages` is kept, so the result contains the shallowest page per layout
 * only if the input is ordered by depth ascending.
 *
 * - Pages at depth 0 (a missing depth counts as 0) are always included.
 * - Pages without any hash are always included (treated as unique).
 * - The main page registers the same fallback hash as subpages do, so a
 *   subpage sharing the main page's `content_hash` is not scanned a second
 *   time when the main page has no `template_hash`.
 *
 * @param pages page rows carrying `depth`, `template_hash`, `content_hash`
 * @returns the pages to scan, in their original relative order
 */
private deduplicateByLayout(pages: Record<string, unknown>[]): Record<string, unknown>[] {
  const seenHashes = new Set<string>();
  const result: Record<string, unknown>[] = [];

  for (const page of pages) {
    const isMainPage = Number(page.depth ?? 0) === 0;
    const hash =
      (page.template_hash as string | null) || (page.content_hash as string | null);

    if (isMainPage || !hash || !seenHashes.has(hash)) {
      result.push(page);
    }
    if (hash) seenHashes.add(hash);
  }

  return result;
}
/** /**
* Check for website changes and trigger scans if needed * Check for website changes and trigger scans if needed
*/ */
@@ -213,24 +272,23 @@ export class ScanScheduler {
return; return;
} }
// Get the main page // Get all unique-layout pages (not just main)
const { data: page, error: pageError } = await this.supabase const { data: pages, error: pageError } = await this.supabase
.from('pages') .from('pages')
.select('id') .select('id, url, path, template_hash, depth, content_hash')
.eq('website_id', website.id) .eq('website_id', website.id)
.eq('is_active', true) .eq('is_active', true)
.order('created_at', { ascending: false }) .order('depth', { ascending: true });
.limit(1)
.single();
if (pageError || !page) { if (pageError || !pages || pages.length === 0) {
return; return;
} }
// Trigger a high-priority scan due to changes // Scan main page with high priority on change
const mainPage = pages[0];
const scanConfig: ScanConfig = { const scanConfig: ScanConfig = {
websiteId: website.id, websiteId: website.id,
pageId: page.id as string, pageId: mainPage.id as string,
deviceType: 'desktop', // Start with desktop for change detection deviceType: 'desktop', // Start with desktop for change detection
categories: ['performance', 'accessibility', 'seo', 'best_practices'], categories: ['performance', 'accessibility', 'seo', 'best_practices'],
priority: 'high', priority: 'high',