feat: implement core proxy server, crawler, and indexer modules

packages/shared:
- Zod v4 schemas for TopicConfig, ProxyConfig, CrawlJob, SearchQuery
- Config loader with defaults
- Utility functions (createId, formatBytes, normalizeUrl)

packages/core:
- WebProxyServer: HTTP forward proxy using http-proxy-3
- CacheStore: LRU-based in-memory + disk cache for proxied responses
- WarcWriter: WARC file archiving for all proxied content
- HTTPS CONNECT tunneling for SSL passthrough
- Admin API with /api/status, /api/cache/stats, /api/config

packages/indexer:
- TopicCrawler: Crawlee CheerioCrawler for topic-based web crawling
- ContentExtractor: @mozilla/readability + turndown for clean text/markdown
- SearchClient: MeiliSearch integration for full-text search
- CrawlScheduler: Interval-based crawl job scheduling

apps/proxy:
- Main entry point orchestrating all components
- Graceful shutdown handling
- Proxy-only mode when no topics configured

All packages type-check clean. Next.js build passes.

Co-Authored-By: UnicornDev <noreply@unicorndev.wtf>
This commit is contained in:
Jeremy Meyer 2026-02-26 19:04:10 -08:00
parent a2bae5c98e
commit 9386d4a7b3
28 changed files with 5066 additions and 41 deletions

View File

@ -13,6 +13,12 @@
"dependencies": { "dependencies": {
"@webproxy/core": "workspace:*", "@webproxy/core": "workspace:*",
"@webproxy/indexer": "workspace:*", "@webproxy/indexer": "workspace:*",
"@webproxy/shared": "workspace:*" "@webproxy/shared": "workspace:*",
"zod": "^4.3.6"
},
"devDependencies": {
"@types/node": "^20.19.35",
"tsx": "^4.21.0",
"typescript": "^5.9.3"
} }
} }

View File

@ -1,11 +1,154 @@
/** /**
* @file index * @file index
* @description WebProxy server entry point * @description WebProxy main entry point - starts proxy, crawler, and admin API
* @layer Application * @layer Application
* *
* Starts the proxy server, indexer, and serves cached content to network devices. * Orchestrates all components: HTTP proxy server, crawl scheduler,
* This is a placeholder full implementation coming in Phase 2. * search client, and admin API. Handles graceful shutdown.
*/ */
console.log("WebProxy v0.1.0 — proxy server placeholder"); import { loadConfig, formatBytes, type ProxyConfig } from "@webproxy/shared";
console.log("Run `pnpm dev` in apps/web for the landing page"); import { WebProxyServer, Logger } from "@webproxy/core";
import { TopicCrawler, CrawlScheduler, SearchClient, ContentExtractor } from "@webproxy/indexer";
const log = new Logger("WebProxy");

/**
 * Boot the full WebProxy stack: load configuration, connect the (optional)
 * search backend, start the forward proxy + admin API, register topic
 * crawls, and wire graceful shutdown on SIGINT/SIGTERM.
 *
 * Exits the process with code 1 on config failure or any fatal error.
 */
async function main() {
  log.info("WebProxy v0.1.0 starting...");

  // Load config — optional path via `node index.js <config-path>`,
  // otherwise the loader falls back to its defaults.
  const configPath = process.argv[2] ?? undefined;
  let config: ProxyConfig;
  try {
    config = loadConfig(configPath);
    log.info("Configuration loaded", {
      proxy: `${config.server.host}:${config.server.port}`,
      admin: `${config.server.host}:${config.server.adminPort}`,
      topics: config.topics.length,
      cacheDir: config.cache.dir,
    });
  } catch (err) {
    log.error("Failed to load config", { error: String(err) });
    process.exit(1);
  }

  // Search is optional: proxying and crawling still work when MeiliSearch
  // is unreachable — indexing is simply skipped.
  const search = new SearchClient({
    host: config.search.host,
    apiKey: config.search.apiKey,
    indexName: config.search.indexName,
  });
  const searchAvailable = await search.init();
  if (searchAvailable) {
    log.info("MeiliSearch connected", { host: config.search.host });
  } else {
    log.warn("MeiliSearch not available - search will be disabled");
  }

  // Initialize proxy server and surface its lifecycle events in the log.
  const proxy = new WebProxyServer(config);
  proxy.on("request", (event) => {
    const cached = event.cached ? " [CACHED]" : "";
    log.info(
      `${event.method} ${event.statusCode} ${event.url} ${formatBytes(event.size)} ${event.duration}ms${cached}`,
    );
  });
  proxy.on("cacheHit", (url, size) => {
    log.debug(`Cache HIT: ${url} (${formatBytes(size)})`);
  });
  proxy.on("cacheStore", (url, size) => {
    log.debug(`Cached: ${url} (${formatBytes(size)})`);
  });
  proxy.on("error", (err, url) => {
    log.error(`Error: ${url ?? "unknown"}`, { error: err.message });
  });

  // Crawler + scheduler. TopicCrawler constructs its own ContentExtractor
  // internally, so no extractor instance is needed here (the previously
  // created local one was unused).
  const crawler = new TopicCrawler({ search });
  const scheduler = new CrawlScheduler(crawler, {
    // Mirror every crawled page into the proxy cache (so network devices
    // can be served from it) and append it to the WARC archive.
    onPageCrawled: async (result, _extracted) => {
      proxy.cacheStore.set(result.url, {
        url: result.url,
        fetchedAt: result.fetchedAt,
        expiresAt: new Date(Date.now() + config.cache.maxAge),
        contentType: result.contentType,
        statusCode: result.statusCode,
        headers: result.headers,
        size: result.size,
      }, Buffer.from(result.rawHtml, "utf-8"));
      await proxy.warcWriter.write({
        url: result.url,
        method: "GET",
        requestHeaders: { "User-Agent": "WebProxy/0.1" },
        statusCode: result.statusCode,
        responseHeaders: result.headers,
        body: Buffer.from(result.rawHtml, "utf-8"),
        timestamp: result.fetchedAt,
      });
    },
    onJobComplete: (job) => {
      log.info(`Crawl job ${job.id} completed`, {
        topic: job.topicId,
        pages: job.pagesProcessed,
        failed: job.pagesFailed,
        bytes: formatBytes(job.bytesDownloaded),
      });
    },
    onError: (err, url) => {
      log.warn(`Crawl error: ${url}`, { error: err.message });
    },
  });

  // Register configured topics with the scheduler.
  for (const topic of config.topics) {
    scheduler.addTopic(topic);
    log.info(`Topic registered: ${topic.name}`, {
      urls: topic.seedUrls.length,
      interval: `${topic.schedule.intervalMinutes}m`,
    });
  }

  // Start everything; with no topics we run in proxy-only mode.
  await proxy.start();
  if (config.topics.length > 0) {
    scheduler.start();
    log.info(`Scheduler started with ${config.topics.length} topics`);
  } else {
    log.info("No topics configured - proxy-only mode (add topics in config)");
  }
  log.info("WebProxy is running");
  log.info(`  Proxy: http://${config.server.host}:${config.server.port}`);
  log.info(`  Admin: http://${config.server.host}:${config.server.adminPort}/api/status`);
  log.info(`  Cache: ${config.cache.dir}`);
  log.info(`  WARC:  ${config.warc.dir}`);

  // Graceful shutdown. The guard prevents a second signal (e.g. a
  // double Ctrl-C) from re-entering teardown while proxy.stop() is
  // still flushing the cache index and WARC files.
  let shuttingDown = false;
  const shutdown = async (signal: string) => {
    if (shuttingDown) return;
    shuttingDown = true;
    log.info(`Received ${signal}, shutting down...`);
    scheduler.stop();
    await proxy.stop();
    process.exit(0);
  };
  process.on("SIGINT", () => void shutdown("SIGINT"));
  process.on("SIGTERM", () => void shutdown("SIGTERM"));
}

main().catch((err) => {
  log.error("Fatal error", { error: String(err) });
  process.exit(1);
});

View File

@ -5,17 +5,27 @@
"description": "Local internet indexing layer - crawl, cache, and serve web content to your network", "description": "Local internet indexing layer - crawl, cache, and serve web content to your network",
"scripts": { "scripts": {
"dev": "pnpm --filter @webproxy/web dev", "dev": "pnpm --filter @webproxy/web dev",
"dev:proxy": "pnpm --filter @webproxy/proxy dev",
"build": "pnpm --filter @webproxy/web build", "build": "pnpm --filter @webproxy/web build",
"start": "pnpm --filter @webproxy/web start", "start": "pnpm --filter @webproxy/web start",
"start:proxy": "pnpm --filter @webproxy/proxy start",
"lint": "pnpm -r lint", "lint": "pnpm -r lint",
"clean": "pnpm -r clean", "clean": "pnpm -r clean",
"typecheck": "pnpm -r typecheck" "typecheck": "pnpm -r --no-bail typecheck"
}, },
"keywords": ["proxy", "web-indexer", "cache", "local-network"], "keywords": [
"proxy",
"web-indexer",
"cache",
"local-network"
],
"license": "MIT", "license": "MIT",
"engines": { "engines": {
"node": ">=20", "node": ">=20",
"pnpm": ">=9" "pnpm": ">=9"
}, },
"packageManager": "pnpm@10.24.0" "packageManager": "pnpm@10.24.0",
"devDependencies": {
"typescript": "^5.9.3"
}
} }

View File

@ -5,10 +5,18 @@
"main": "./src/index.ts", "main": "./src/index.ts",
"types": "./src/index.ts", "types": "./src/index.ts",
"dependencies": { "dependencies": {
"@webproxy/shared": "workspace:*" "@webproxy/shared": "workspace:*",
"http-proxy-3": "^1.23.2",
"lru-cache": "^11.2.6",
"warcio": "^2.4.10",
"zod": "^4.3.6"
}, },
"scripts": { "scripts": {
"typecheck": "tsc --noEmit", "typecheck": "tsc --noEmit",
"clean": "rm -rf dist" "clean": "rm -rf dist"
},
"devDependencies": {
"@types/node": "^25.3.2",
"typescript": "^5.9.3"
} }
} }

View File

@ -0,0 +1,212 @@
/**
* @file cache-store
* @description In-memory + disk cache for proxied content with LRU eviction
* @layer Core
*
* Manages cached HTTP responses. Uses LRU-cache for hot metadata lookups
* and filesystem for response bodies. Tracks cache hit/miss stats.
*/
import { LRUCache } from "lru-cache";
import { createHash } from "node:crypto";
import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync, readdirSync, statSync } from "node:fs";
import { join } from "node:path";
import { normalizeUrl, formatBytes, type CachedPage, type CacheStats } from "@webproxy/shared";
import { Logger } from "./logger.js";
/** One index record: page metadata plus the on-disk location of the body. */
type CacheEntry = {
  meta: CachedPage; // HTTP metadata (status, headers, expiry, content hash)
  bodyPath: string; // path of the `<key>.bin` file holding the response body
};
/**
 * Two-tier response cache: an in-memory LRU index of metadata plus
 * response bodies stored as `<key>.bin` files on disk. Keys are truncated
 * SHA-256 hashes of the normalized URL. Tracks hit/miss statistics and can
 * persist/restore its index across restarts via JSON files in `meta/`.
 */
export class CacheStore {
  /** Hot metadata index; LRU-evicted entries have their body files removed. */
  private index: LRUCache<string, CacheEntry>;
  private cacheDir: string;
  private stats = { hits: 0, misses: 0 };
  private log: Logger;

  constructor(options: {
    dir: string;
    maxSizeBytes: number;
    maxAge: number;
    logger?: Logger;
  }) {
    this.cacheDir = options.dir;
    this.log = options.logger ?? new Logger("CacheStore");
    if (!existsSync(this.cacheDir)) {
      mkdirSync(this.cacheDir, { recursive: true });
    }
    this.index = new LRUCache<string, CacheEntry>({
      max: 50_000, // hard cap on in-memory index entries
      // Honour the configured byte budget: evict least-recently-used
      // entries once cached bodies exceed maxSizeBytes. (The option was
      // previously accepted but never applied.) sizeCalculation must
      // return a positive integer, hence the max(1, ...).
      maxSize: options.maxSizeBytes,
      sizeCalculation: (entry) => Math.max(1, entry.meta.size),
      ttl: options.maxAge,
      // Overwriting a key reuses the same `<key>.bin` path, so disposing
      // on set would delete the body that set() just wrote. Only clean up
      // the disk file on eviction, expiry, or explicit delete.
      noDisposeOnSet: true,
      dispose: (_value, key) => {
        this.deleteFromDisk(key);
      },
    });
    this.loadIndex();
  }

  /** True if a fresh (non-expired) entry exists; drops stale entries lazily. */
  has(url: string): boolean {
    const key = this.urlToKey(url);
    const entry = this.index.get(key);
    if (!entry) return false;
    // Check if expired
    if (entry.meta.expiresAt < new Date()) {
      this.index.delete(key); // dispose() removes the body file too
      return false;
    }
    return true;
  }

  /**
   * Fetch a cached response. Returns null (counting a miss) when the entry
   * is absent, expired, or its body file is unreadable; otherwise counts a
   * hit and returns metadata plus body bytes.
   */
  get(url: string): { meta: CachedPage; body: Buffer } | null {
    const key = this.urlToKey(url);
    const entry = this.index.get(key);
    if (!entry) {
      this.stats.misses++;
      return null;
    }
    if (entry.meta.expiresAt < new Date()) {
      this.index.delete(key);
      this.stats.misses++;
      return null;
    }
    try {
      const body = readFileSync(entry.bodyPath);
      this.stats.hits++;
      return { meta: entry.meta, body };
    } catch {
      // Body file vanished out from under us — treat as a miss and
      // drop the dangling index entry.
      this.index.delete(key);
      this.stats.misses++;
      return null;
    }
  }

  /** Write the body to disk, compute its content hash, and index it. */
  set(url: string, meta: Omit<CachedPage, "contentHash">, body: Buffer): void {
    const key = this.urlToKey(url);
    const contentHash = createHash("sha256").update(body).digest("hex");
    const bodyPath = join(this.cacheDir, `${key}.bin`);
    writeFileSync(bodyPath, body);
    const fullMeta: CachedPage = { ...meta, contentHash };
    this.index.set(key, { meta: fullMeta, bodyPath });
    this.log.debug(`Cached ${formatBytes(body.length)}`, { url });
  }

  /** Remove an entry; dispose() deletes its body file. Returns true if present. */
  delete(url: string): boolean {
    const key = this.urlToKey(url);
    return this.index.delete(key);
  }

  /** Aggregate size/age/hit-rate statistics over the current index. */
  getStats(): CacheStats {
    const entries = [...this.index.values()];
    const total = this.stats.hits + this.stats.misses;
    let totalSize = 0;
    let oldest: Date | null = null;
    let newest: Date | null = null;
    for (const entry of entries) {
      totalSize += entry.meta.size;
      if (!oldest || entry.meta.fetchedAt < oldest) oldest = entry.meta.fetchedAt;
      if (!newest || entry.meta.fetchedAt > newest) newest = entry.meta.fetchedAt;
    }
    return {
      totalPages: this.index.size,
      totalSize,
      oldestEntry: oldest,
      newestEntry: newest,
      hitRate: total > 0 ? this.stats.hits / total : 0,
      missRate: total > 0 ? this.stats.misses / total : 0,
    };
  }

  /** Drop every entry (dispose() removes body files) and reset counters. */
  clear(): void {
    this.index.clear();
    this.stats = { hits: 0, misses: 0 };
    this.log.info("Cache cleared");
  }

  /** Stable cache key: first 32 hex chars of SHA-256 of the normalized URL. */
  private urlToKey(url: string): string {
    const normalized = normalizeUrl(url);
    return createHash("sha256").update(normalized).digest("hex").slice(0, 32);
  }

  /** Best-effort removal of an entry's body file. */
  private deleteFromDisk(key: string): void {
    const bodyPath = join(this.cacheDir, `${key}.bin`);
    try {
      unlinkSync(bodyPath);
    } catch {
      // File may already be gone
    }
  }

  /**
   * Restore the index from `meta/*.json` written by persistIndex(),
   * skipping expired entries, entries whose body file is missing, and
   * corrupted JSON.
   */
  private loadIndex(): void {
    const metaDir = join(this.cacheDir, "meta");
    if (!existsSync(metaDir)) {
      mkdirSync(metaDir, { recursive: true });
      return;
    }
    try {
      const files = readdirSync(metaDir).filter((f) => f.endsWith(".json"));
      let loaded = 0;
      for (const file of files) {
        try {
          const raw = readFileSync(join(metaDir, file), "utf-8");
          const entry: CacheEntry = JSON.parse(raw);
          // Dates arrive as ISO strings from JSON — revive them.
          entry.meta.fetchedAt = new Date(entry.meta.fetchedAt);
          entry.meta.expiresAt = new Date(entry.meta.expiresAt);
          if (entry.meta.expiresAt > new Date() && existsSync(entry.bodyPath)) {
            const key = file.replace(".json", "");
            this.index.set(key, entry);
            loaded++;
          }
        } catch {
          // Skip corrupted entries
        }
      }
      if (loaded > 0) {
        this.log.info(`Loaded ${loaded} cached entries from disk`);
      }
    } catch {
      // Fresh cache
    }
  }

  /** Snapshot the in-memory index to `meta/*.json` for the next start-up. */
  persistIndex(): void {
    const metaDir = join(this.cacheDir, "meta");
    if (!existsSync(metaDir)) {
      mkdirSync(metaDir, { recursive: true });
    }
    let saved = 0;
    for (const [key, entry] of this.index.entries()) {
      try {
        writeFileSync(join(metaDir, `${key}.json`), JSON.stringify(entry));
        saved++;
      } catch {
        // Skip
      }
    }
    this.log.info(`Persisted ${saved} cache entries`);
  }

  /** Number of entries currently in the index. */
  get size(): number {
    return this.index.size;
  }
}

View File

@ -1,7 +1,11 @@
/** /**
* @file index * @file index
* @description Core proxy engine - HTTP handling and caching * @description Core proxy engine - HTTP proxy, caching, WARC storage
* @layer Core * @layer Core
*/ */
export { type ProxyConfig, type CachedPage } from "@webproxy/shared"; export { WebProxyServer } from "./proxy-server.js";
export { CacheStore } from "./cache-store.js";
export { WarcWriter } from "./warc-writer.js";
export { Logger } from "./logger.js";
export type { ProxyEvent, ProxyEventMap } from "./types.js";

View File

@ -0,0 +1,73 @@
/**
* @file logger
* @description Structured logger for the proxy
* @layer Core
*/
import type { LogLevel } from "@webproxy/shared";
/** Numeric rank per level; messages below the logger's rank are dropped. */
const LEVEL_PRIORITY: Record<LogLevel, number> = {
  debug: 0,
  info: 1,
  warn: 2,
  error: 3,
};

/** ANSI color applied to each level's prefix. */
const LEVEL_COLOR: Record<LogLevel, string> = {
  debug: "\x1b[90m",
  info: "\x1b[36m",
  warn: "\x1b[33m",
  error: "\x1b[31m",
};

/** ANSI reset terminating the colored prefix. */
const RESET = "\x1b[0m";

/**
 * Minimal console logger with level filtering, ANSI-colored
 * `[timestamp] [LEVEL] [context]` prefixes, and hierarchical child
 * contexts joined with `:`.
 */
export class Logger {
  /** Configured minimum level; kept as the name so child() can reuse it. */
  private readonly level: LogLevel;

  constructor(
    private context: string,
    level: LogLevel = "info",
  ) {
    this.level = level;
  }

  debug(message: string, data?: Record<string, unknown>) {
    this.write("debug", message, data);
  }

  info(message: string, data?: Record<string, unknown>) {
    this.write("info", message, data);
  }

  warn(message: string, data?: Record<string, unknown>) {
    this.write("warn", message, data);
  }

  error(message: string, data?: Record<string, unknown>) {
    this.write("error", message, data);
  }

  /** Derive a logger named `${parent}:${context}` at the same level. */
  child(context: string): Logger {
    return new Logger(`${this.context}:${context}`, this.level);
  }

  /** Format and emit one line; no-op when `level` is below the threshold. */
  private write(level: LogLevel, message: string, data?: Record<string, unknown>) {
    if (LEVEL_PRIORITY[level] < LEVEL_PRIORITY[this.level]) return;
    const stamp = new Date().toISOString();
    const prefix = `${LEVEL_COLOR[level]}[${stamp}] [${level.toUpperCase()}] [${this.context}]${RESET}`;
    if (data === undefined) {
      console.log(`${prefix} ${message}`);
    } else {
      console.log(`${prefix} ${message}`, data);
    }
  }
}

View File

@ -0,0 +1,375 @@
/**
* @file proxy-server
* @description Forward HTTP proxy server using http-proxy-3
* @layer Core
*
* Acts as a forward proxy for network devices. Intercepts HTTP requests,
* checks local cache, serves cached content or fetches from upstream.
* Archives responses to WARC and indexes content for search.
*/
import { createServer, Server, IncomingMessage, ServerResponse } from "node:http";
import { createProxyServer, type ProxyServer as HttpProxy } from "http-proxy-3";
import { EventEmitter } from "node:events";
import { createHash } from "node:crypto";
import { connect as netConnect, type Socket } from "node:net";
import { type ProxyConfig, formatBytes } from "@webproxy/shared";
import { CacheStore } from "./cache-store.js";
import { WarcWriter, type WarcRecord } from "./warc-writer.js";
import { Logger } from "./logger.js";
import type { ProxyEvent, ProxyEventMap } from "./types.js";
/**
 * Forward HTTP proxy with transparent caching, WARC archiving, and a small
 * JSON admin API on a separate port. Plain-HTTP GETs are served from cache
 * when fresh; HTTPS is tunneled opaquely via CONNECT (no interception), so
 * TLS traffic is never cached or archived. Emits ProxyEventMap events.
 */
export class WebProxyServer extends EventEmitter<ProxyEventMap> {
  private server: Server | null = null; // forward-proxy listener (HTTP + CONNECT)
  private proxy: HttpProxy; // http-proxy-3 instance handling upstream requests
  private cache: CacheStore; // memory-index + disk-body response cache
  private warc: WarcWriter; // append-only WARC archive of responses
  private log: Logger;
  private config: ProxyConfig;
  private adminServer: Server | null = null; // /api/* listener on adminPort

  constructor(config: ProxyConfig) {
    super();
    this.config = config;
    this.log = new Logger("Proxy", config.logging.level);
    this.cache = new CacheStore({
      dir: config.cache.dir,
      maxSizeBytes: config.cache.maxSizeBytes,
      maxAge: config.cache.maxAge,
      logger: this.log.child("cache"),
    });
    this.warc = new WarcWriter({
      dir: config.warc.dir,
      maxFileSize: config.warc.maxFileSize,
      compress: config.warc.compress,
      logger: this.log.child("warc"),
    });
    // selfHandleResponse: upstream responses are buffered and written to
    // the client by our own proxyRes handler (see setupProxyEvents) so
    // they can be cached/archived first.
    this.proxy = createProxyServer({
      changeOrigin: true,
      selfHandleResponse: true,
      proxyTimeout: config.proxy.timeout,
      followRedirects: config.proxy.followRedirects,
    });
    this.setupProxyEvents();
  }

  /**
   * Start both listeners: the main proxy (plain HTTP requests plus HTTPS
   * CONNECT tunnels) and the admin API. Resolves once both are listening,
   * then emits "start".
   */
  async start(): Promise<void> {
    const { host, port, adminPort } = this.config.server;
    // Main proxy server
    this.server = createServer((req, res) => this.handleRequest(req, res));
    this.server.on("connect", (req: IncomingMessage, socket: Socket, head: Buffer) => {
      this.handleConnect(req, socket, head);
    });
    await new Promise<void>((resolve) => {
      this.server!.listen(port, host, () => {
        this.log.info(`Proxy server listening on ${host}:${port}`);
        resolve();
      });
    });
    // Admin API server
    this.adminServer = createServer((req, res) => this.handleAdminRequest(req, res));
    await new Promise<void>((resolve) => {
      this.adminServer!.listen(adminPort, host, () => {
        this.log.info(`Admin API listening on ${host}:${adminPort}`);
        resolve();
      });
    });
    this.emit("start");
  }

  /**
   * Graceful shutdown: flush the cache index to disk, close the current
   * WARC file, close both HTTP listeners and the proxy engine, then emit
   * "stop".
   */
  async stop(): Promise<void> {
    this.log.info("Shutting down...");
    // Persist cache index
    this.cache.persistIndex();
    // Close WARC writer
    await this.warc.close();
    // Close servers
    await Promise.all([
      new Promise<void>((resolve) => {
        if (this.server) this.server.close(() => resolve());
        else resolve();
      }),
      new Promise<void>((resolve) => {
        if (this.adminServer) this.adminServer.close(() => resolve());
        else resolve();
      }),
    ]);
    this.proxy.close();
    this.emit("stop");
    this.log.info("Shutdown complete");
  }

  /**
   * Handle one plain-HTTP proxy request: serve a fresh cached copy for
   * GETs, otherwise forward to the upstream origin. In forward-proxy form
   * req.url is the absolute URL, which resolveTarget() reduces to an
   * origin for http-proxy.
   */
  private async handleRequest(req: IncomingMessage, res: ServerResponse): Promise<void> {
    const url = req.url ?? "";
    const method = req.method ?? "GET";
    const startTime = Date.now();
    // Only cache GET requests
    // NOTE(review): has() followed by get() is a double index lookup;
    // get() alone performs the same expiry checks.
    if (method === "GET" && this.cache.has(url)) {
      const cached = this.cache.get(url);
      if (cached) {
        const duration = Date.now() - startTime;
        this.log.debug(`Cache HIT: ${url} (${formatBytes(cached.meta.size)})`);
        // Write cached response
        res.writeHead(cached.meta.statusCode, cached.meta.headers);
        res.end(cached.body);
        this.emitEvent({
          url,
          method,
          statusCode: cached.meta.statusCode,
          contentType: cached.meta.contentType,
          size: cached.meta.size,
          cached: true,
          duration,
          timestamp: new Date(),
        });
        this.emit("cacheHit", url, cached.meta.size);
        return;
      }
    }
    // Forward to upstream
    try {
      const target = this.resolveTarget(url);
      if (!target) {
        res.writeHead(400, { "Content-Type": "text/plain" });
        res.end("Bad Request: invalid URL");
        return;
      }
      // Response is delivered by the proxyRes handler (selfHandleResponse).
      this.proxy.web(req, res, { target });
    } catch (err) {
      this.log.error(`Proxy error: ${url}`, { error: String(err) });
      this.emit("error", err instanceof Error ? err : new Error(String(err)), url);
      if (!res.headersSent) {
        res.writeHead(502, { "Content-Type": "text/plain" });
        res.end("Bad Gateway");
      }
    }
  }

  /**
   * HTTPS CONNECT handler: open a raw TCP connection to the requested
   * host:port and splice the two sockets together. The tunneled bytes are
   * TLS, so nothing here is inspected, cached, or archived.
   */
  private handleConnect(
    req: IncomingMessage,
    clientSocket: Socket,
    _head: Buffer,
  ): void {
    // HTTPS CONNECT method — tunnel the connection
    // req.url is "host:port"; a missing port makes parseInt yield NaN,
    // which falls back to 443.
    const [hostname, port] = (req.url ?? "").split(":");
    const serverPort = parseInt(port, 10) || 443;
    this.log.debug(`CONNECT tunnel: ${hostname}:${serverPort}`);
    const serverSocket = netConnect(serverPort, hostname, () => {
      clientSocket.write(
        "HTTP/1.1 200 Connection Established\r\nProxy-Agent: WebProxy\r\n\r\n",
      );
      serverSocket.pipe(clientSocket);
      clientSocket.pipe(serverSocket);
    });
    serverSocket.on("error", (err) => {
      this.log.error(`CONNECT error: ${hostname}:${serverPort}`, { error: err.message });
      clientSocket.end("HTTP/1.1 502 Bad Gateway\r\n\r\n");
    });
    clientSocket.on("error", () => {
      serverSocket.destroy();
    });
  }

  /**
   * Wire the http-proxy event handlers. Because selfHandleResponse is on,
   * the proxyRes handler is responsible for writing the upstream response
   * to the client; it buffers the whole body so it can cache and archive
   * successful GETs before forwarding.
   *
   * NOTE(review): the entire response body is buffered in memory with no
   * size cap, and `duration` is measured from header arrival (body
   * download time only), not from when the client request came in.
   */
  private setupProxyEvents(): void {
    // Capture the response from upstream
    this.proxy.on("proxyRes", (proxyRes, req, res) => {
      const url = req.url ?? "";
      const method = req.method ?? "GET";
      const statusCode = proxyRes.statusCode ?? 500;
      const contentType = proxyRes.headers["content-type"] ?? "application/octet-stream";
      const startTime = Date.now();
      const chunks: Buffer[] = [];
      proxyRes.on("data", (chunk: Buffer) => {
        chunks.push(chunk);
      });
      proxyRes.on("end", () => {
        const body = Buffer.concat(chunks);
        const duration = Date.now() - startTime;
        // Forward response to client
        const headers: Record<string, string | string[] | undefined> = {};
        for (const [key, value] of Object.entries(proxyRes.headers)) {
          headers[key] = value;
        }
        res.writeHead(statusCode, headers);
        res.end(body);
        // Cache successful GET responses (2xx/3xx)
        if (method === "GET" && statusCode >= 200 && statusCode < 400) {
          // Honour upstream Cache-Control max-age when present, else use
          // the configured default TTL.
          const maxAge = this.parseMaxAge(proxyRes.headers["cache-control"]);
          const expiresAt = new Date(Date.now() + (maxAge ?? this.config.cache.maxAge));
          // Flatten multi-value headers for storage.
          const flatHeaders: Record<string, string> = {};
          for (const [k, v] of Object.entries(proxyRes.headers)) {
            if (typeof v === "string") flatHeaders[k] = v;
            else if (Array.isArray(v)) flatHeaders[k] = v.join(", ");
          }
          this.cache.set(url, {
            url,
            fetchedAt: new Date(),
            expiresAt,
            contentType,
            statusCode,
            headers: flatHeaders,
            size: body.length,
          }, body);
          this.emit("cacheStore", url, body.length);
          // Write WARC record in background
          const warcRecord: WarcRecord = {
            url,
            method,
            requestHeaders: this.extractHeaders(req),
            statusCode,
            responseHeaders: flatHeaders,
            body,
            timestamp: new Date(),
          };
          this.warc.write(warcRecord).catch((err) => {
            this.log.error("WARC write failed", { error: String(err), url });
          });
          // NOTE(review): "cacheMiss" is emitted only for responses that
          // were just stored, not for uncacheable misses.
          this.emit("cacheMiss", url);
        }
        this.emitEvent({
          url,
          method,
          statusCode,
          contentType,
          size: body.length,
          cached: false,
          duration,
          timestamp: new Date(),
        });
      });
    });
    this.proxy.on("error", (err, req, res) => {
      const url = (req as IncomingMessage).url ?? "unknown";
      this.log.error(`Upstream error: ${url}`, { error: String(err) });
      this.emit("error", err instanceof Error ? err : new Error(String(err)), url);
      if (res instanceof ServerResponse && !res.headersSent) {
        res.writeHead(502, { "Content-Type": "text/plain" });
        res.end("Bad Gateway: upstream server error");
      }
    });
  }

  /**
   * Admin API router. Routes: GET /api/status, GET /api/cache/stats,
   * POST /api/cache/clear, GET /api/config. CORS is wide open ("*") —
   * NOTE(review): confirm the admin port is not exposed beyond the local
   * network, since /api/cache/clear is unauthenticated.
   */
  private async handleAdminRequest(req: IncomingMessage, res: ServerResponse): Promise<void> {
    const url = new URL(req.url ?? "/", `http://${req.headers.host}`);
    const path = url.pathname;
    // CORS
    res.setHeader("Access-Control-Allow-Origin", "*");
    res.setHeader("Access-Control-Allow-Methods", "GET, POST, DELETE, OPTIONS");
    res.setHeader("Access-Control-Allow-Headers", "Content-Type");
    if (req.method === "OPTIONS") {
      res.writeHead(204);
      res.end();
      return;
    }
    try {
      if (path === "/api/status" && req.method === "GET") {
        const stats = this.cache.getStats();
        this.sendJson(res, {
          status: "running",
          cache: stats,
          warc: { totalRecords: this.warc.totalRecords },
          uptime: process.uptime(),
        });
      } else if (path === "/api/cache/stats" && req.method === "GET") {
        this.sendJson(res, this.cache.getStats());
      } else if (path === "/api/cache/clear" && req.method === "POST") {
        this.cache.clear();
        this.sendJson(res, { cleared: true });
      } else if (path === "/api/config" && req.method === "GET") {
        // Only non-sensitive config fields are exposed.
        this.sendJson(res, {
          server: this.config.server,
          cache: { dir: this.config.cache.dir, maxSizeBytes: this.config.cache.maxSizeBytes },
          topics: this.config.topics.length,
        });
      } else {
        res.writeHead(404, { "Content-Type": "application/json" });
        res.end(JSON.stringify({ error: "Not Found" }));
      }
    } catch (err) {
      res.writeHead(500, { "Content-Type": "application/json" });
      res.end(JSON.stringify({ error: String(err) }));
    }
  }

  /** Serialize `data` as pretty-printed JSON with a 200 status. */
  private sendJson(res: ServerResponse, data: unknown): void {
    res.writeHead(200, { "Content-Type": "application/json" });
    res.end(JSON.stringify(data, null, 2));
  }

  /** Reduce an absolute request URL to its origin for http-proxy; null if unparsable. */
  private resolveTarget(url: string): string | null {
    try {
      const parsed = new URL(url);
      return `${parsed.protocol}//${parsed.host}`;
    } catch {
      return null;
    }
  }

  /** Extract Cache-Control max-age in milliseconds, or null when absent. */
  private parseMaxAge(cacheControl?: string): number | null {
    if (!cacheControl) return null;
    const match = cacheControl.match(/max-age=(\d+)/);
    if (!match) return null;
    return parseInt(match[1], 10) * 1000;
  }

  /** Flatten incoming request headers to single string values (arrays joined). */
  private extractHeaders(req: IncomingMessage): Record<string, string> {
    const headers: Record<string, string> = {};
    for (const [key, value] of Object.entries(req.headers)) {
      if (typeof value === "string") headers[key] = value;
      else if (Array.isArray(value)) headers[key] = value.join(", ");
    }
    return headers;
  }

  /** Emit a "request" event for every completed (cached or proxied) request. */
  private emitEvent(event: ProxyEvent): void {
    this.emit("request", event);
  }

  /** Expose the cache so external components (e.g. the crawler) can seed it. */
  get cacheStore(): CacheStore {
    return this.cache;
  }

  /** Expose the WARC writer for external archiving (e.g. crawled pages). */
  get warcWriter(): WarcWriter {
    return this.warc;
  }
}

View File

@ -0,0 +1,26 @@
/**
* @file types
* @description Core-specific types
* @layer Core
*/
/** One completed proxy transaction, emitted as the "request" event. */
export type ProxyEvent = {
  url: string; // requested URL (absolute, forward-proxy form)
  method: string; // HTTP method, e.g. "GET"
  statusCode: number; // status returned to the client
  contentType: string; // response Content-Type
  size: number; // response body size in bytes
  cached: boolean; // true when served from the local cache
  duration: number; // elapsed milliseconds for this transaction
  timestamp: Date; // when the response completed
};

/** Typed event map for WebProxyServer's EventEmitter. */
export type ProxyEventMap = {
  request: [event: ProxyEvent]; // every completed request
  cacheHit: [url: string, size: number]; // served from cache
  cacheMiss: [url: string]; // fetched from upstream (and stored)
  cacheStore: [url: string, size: number]; // response written to cache
  error: [error: Error, url?: string]; // proxy/upstream failure
  start: []; // both listeners are up
  stop: []; // shutdown complete
};

View File

@ -0,0 +1,142 @@
/**
* @file warc-writer
* @description WARC file writer for archiving proxied content
* @layer Core
*
* Writes HTTP request/response pairs to WARC files using the warcio.js library.
* WARC (Web ARChive) is the standard format for web archiving.
*/
import { createWriteStream, existsSync, mkdirSync, statSync, WriteStream } from "node:fs";
import { join } from "node:path";
import { Logger } from "./logger.js";
/** One HTTP exchange to be archived as a WARC response record. */
export type WarcRecord = {
  url: string; // absolute target URI
  method: string; // request method, e.g. "GET"
  requestHeaders: Record<string, string>; // flattened request headers
  statusCode: number; // upstream response status
  responseHeaders: Record<string, string>; // flattened response headers
  body: Buffer; // raw response body bytes
  timestamp: Date; // capture time (becomes WARC-Date)
};
/**
 * Appends WARC/1.1 response records to files in `dir`, rotating to a new
 * file whenever the current one reaches `maxFileSize` bytes.
 *
 * NOTE(review): the `compress` option is accepted but not currently
 * applied — output files are always uncompressed `.warc`.
 */
export class WarcWriter {
  private warcDir: string;
  private maxFileSize: number; // rotation threshold in bytes
  private currentFile: WriteStream | null = null;
  private currentFilePath = "";
  private currentSize = 0; // bytes written to the current file
  private fileIndex = 0; // monotonically increasing rotation counter
  private log: Logger;
  private recordCount = 0; // total records written since construction

  constructor(options: {
    dir: string;
    maxFileSize: number;
    compress?: boolean;
    logger?: Logger;
  }) {
    this.warcDir = options.dir;
    this.maxFileSize = options.maxFileSize;
    this.log = options.logger ?? new Logger("WarcWriter");
    if (!existsSync(this.warcDir)) {
      mkdirSync(this.warcDir, { recursive: true });
    }
  }

  /**
   * Append one response record, rotating the output file if needed.
   * Resolves with the path of the WARC file the record was written to.
   */
  async write(record: WarcRecord): Promise<string> {
    await this.ensureFile();
    const data = this.formatWarcRecord(record);
    return new Promise((resolve, reject) => {
      if (!this.currentFile) {
        reject(new Error("No WARC file open"));
        return;
      }
      this.currentFile.write(data, (err) => {
        if (err) {
          reject(err);
          return;
        }
        this.currentSize += data.length;
        this.recordCount++;
        this.log.debug(`WARC record written`, { url: record.url, size: data.length });
        resolve(this.currentFilePath);
      });
    });
  }

  /**
   * Build one WARC/1.1 response record as raw bytes. The body Buffer is
   * appended verbatim — the previous implementation round-tripped it
   * through toString("utf-8"), which corrupted binary payloads and made
   * Content-Length disagree with the actual bytes on disk.
   *
   * Request headers in `record` are currently not archived (matching the
   * original behavior, which built but never used a request block).
   */
  private formatWarcRecord(record: WarcRecord): Buffer {
    const warcId = `<urn:uuid:${crypto.randomUUID()}>`;
    const warcDate = record.timestamp.toISOString();
    // HTTP response head: status line + flattened headers.
    // NOTE(review): the reason phrase is hard-coded to "OK" regardless of
    // statusCode; WARC readers generally ignore it.
    const resHeaderLines = Object.entries(record.responseHeaders)
      .map(([k, v]) => `${k}: ${v}`)
      .join("\r\n");
    const statusLine = `HTTP/1.1 ${record.statusCode} OK`;
    const httpHead = Buffer.from(`${statusLine}\r\n${resHeaderLines}\r\n\r\n`, "utf-8");
    // Payload = HTTP head + raw body bytes; Content-Length covers both.
    const payload = Buffer.concat([httpHead, record.body]);
    const warcHeaders = [
      `WARC/1.1`,
      `WARC-Type: response`,
      `WARC-Date: ${warcDate}`,
      `WARC-Target-URI: ${record.url}`,
      `WARC-Record-ID: ${warcId}`,
      `Content-Type: application/http;msgtype=response`,
      `Content-Length: ${payload.length}`,
    ].join("\r\n");
    // Records are terminated by two CRLFs per the WARC spec.
    return Buffer.concat([
      Buffer.from(`${warcHeaders}\r\n\r\n`, "utf-8"),
      payload,
      Buffer.from("\r\n\r\n", "utf-8"),
    ]);
  }

  /** Open a file if none is open, or rotate when the size cap is reached. */
  private async ensureFile(): Promise<void> {
    if (this.currentFile && this.currentSize < this.maxFileSize) {
      return;
    }
    await this.rotateFile();
  }

  /** Close the current file (if any) and open a fresh timestamped one. */
  private async rotateFile(): Promise<void> {
    if (this.currentFile) {
      await new Promise<void>((resolve) => {
        this.currentFile!.end(() => resolve());
      });
    }
    const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
    this.currentFilePath = join(this.warcDir, `webproxy-${timestamp}-${this.fileIndex}.warc`);
    this.currentFile = createWriteStream(this.currentFilePath, { flags: "a" });
    this.currentSize = 0;
    this.fileIndex++;
    this.log.info(`New WARC file: ${this.currentFilePath}`);
  }

  /** Flush and close the current file; safe to call when nothing is open. */
  async close(): Promise<void> {
    if (this.currentFile) {
      await new Promise<void>((resolve) => {
        this.currentFile!.end(() => resolve());
      });
      this.currentFile = null;
    }
    this.log.info(`WARC writer closed. Total records: ${this.recordCount}`);
  }

  /** Total records written since construction. */
  get totalRecords(): number {
    return this.recordCount;
  }
}

View File

@ -5,10 +5,24 @@
"main": "./src/index.ts", "main": "./src/index.ts",
"types": "./src/index.ts", "types": "./src/index.ts",
"dependencies": { "dependencies": {
"@webproxy/shared": "workspace:*" "@crawlee/cheerio": "^3.16.0",
"@crawlee/http": "^3.16.0",
"@mozilla/readability": "^0.6.0",
"@webproxy/shared": "workspace:*",
"cheerio": "^1.2.0",
"crawlee": "^3.16.0",
"meilisearch": "^0.55.0",
"turndown": "^7.2.2",
"zod": "^4.3.6"
}, },
"scripts": { "scripts": {
"typecheck": "tsc --noEmit", "typecheck": "tsc --noEmit",
"clean": "rm -rf dist" "clean": "rm -rf dist"
},
"devDependencies": {
"@types/node": "^25.3.2",
"@types/turndown": "^5.0.6",
"jsdom": "^28.1.0",
"typescript": "^5.9.3"
} }
} }

View File

@ -0,0 +1,208 @@
/**
* @file crawler
* @description Topic-based web crawler using Crawlee's CheerioCrawler
* @layer Indexer
*
* Crawls seed URLs for configured topics, extracts content,
* indexes in search engine, and stores in cache/WARC.
*/
import { CheerioCrawler, type CheerioCrawlingContext, Configuration } from "crawlee";
import { type TopicConfig, type CrawlJob, type CrawlResult, createId } from "@webproxy/shared";
import { ContentExtractor, type ExtractedContent } from "./extractor.js";
import { SearchClient, type SearchDocument } from "./search-client.js";
/** Optional hooks invoked by TopicCrawler.crawlTopic() as a crawl progresses. */
export type CrawlCallbacks = {
  // Called after each page; `extracted` is null when readability extraction fails.
  onPageCrawled?: (result: CrawlResult, extracted: ExtractedContent | null) => void | Promise<void>;
  // Called once when the job finishes (completed or failed).
  onJobComplete?: (job: CrawlJob) => void;
  // Called per failed request after retries are exhausted.
  onError?: (error: Error, url: string) => void;
};
export class TopicCrawler {
  private extractor: ContentExtractor;
  private search: SearchClient | null;
  // Crawlers currently in flight, keyed by topic id.
  private activeCrawlers = new Map<string, CheerioCrawler>();

  constructor(options: {
    search?: SearchClient;
  } = {}) {
    this.extractor = new ContentExtractor();
    this.search = options.search ?? null;
  }

  /**
   * Crawl a topic's seed URLs, following in-scope links up to
   * `topic.schedule.maxDepth`, and return the finished job record.
   *
   * Each page is run through ContentExtractor and, when a SearchClient is
   * configured, indexed for full-text search. `callbacks.onJobComplete`
   * fires on every exit path, including the empty-seed short-circuit.
   */
  async crawlTopic(topic: TopicConfig, callbacks?: CrawlCallbacks): Promise<CrawlJob> {
    const job: CrawlJob = {
      id: createId("crawl"),
      topicId: topic.id,
      status: "running",
      startedAt: new Date(),
      pagesProcessed: 0,
      pagesFailed: 0,
      bytesDownloaded: 0,
    };

    if (topic.seedUrls.length === 0) {
      job.status = "completed";
      job.completedAt = new Date();
      // Fire the completion callback on this path too -- CrawlScheduler
      // relies on it (and on crawlTopic completing) to clear its per-topic
      // "running" flag; previously this early return skipped the callback.
      callbacks?.onJobComplete?.(job);
      return job;
    }

    // NOTE(review): this mutates Crawlee's process-global configuration and
    // therefore affects every crawler in the process, not just this one.
    const config = Configuration.getGlobalConfig();
    config.set("persistStorage", false);

    const crawler = new CheerioCrawler({
      maxRequestsPerCrawl: topic.schedule.maxPagesPerCrawl,
      maxConcurrency: 5,
      requestHandlerTimeoutSecs: 30,
      maxRequestRetries: 2,
      // Arrow functions so `this` is the TopicCrawler instance (replaces the
      // previous `const self = this` pattern, which was declared after the
      // closures that captured it).
      requestHandler: async (context: CheerioCrawlingContext) => {
        const { request, $, body, response } = context;
        const url = request.loadedUrl ?? request.url;
        const statusCode = response?.statusCode ?? 200;
        const contentType = response?.headers?.["content-type"] ?? "text/html";
        const rawHtml = typeof body === "string" ? body : body.toString();

        // Extract readable article content (null when extraction fails).
        const extracted = this.extractor.extract(rawHtml, url);

        const result: CrawlResult = {
          url,
          statusCode,
          contentType,
          content: extracted?.textContent ?? $("body").text().trim(),
          rawHtml,
          // `||` (not `??`): .trim() never returns nullish, so an empty
          // extracted title or empty <title> must still fall back to the URL.
          title: extracted?.title || $("title").text().trim() || url,
          excerpt: extracted?.excerpt ?? "",
          byline: extracted?.byline ?? null,
          fetchedAt: new Date(),
          headers: (response?.headers as Record<string, string>) ?? {},
          size: Buffer.byteLength(rawHtml, "utf-8"),
        };
        job.pagesProcessed++;
        job.bytesDownloaded += result.size;

        // Index in the search engine when one is configured.
        if (this.search && extracted) {
          const doc = this.extractor.toSearchDocument(url, extracted, topic.id);
          await this.search.addDocument(doc);
        }

        if (callbacks?.onPageCrawled) {
          await callbacks.onPageCrawled(result, extracted);
        }

        // Enqueue discovered links, respecting depth and domain restrictions.
        const depth = (request.userData?.depth ?? 0) as number;
        if (depth < topic.schedule.maxDepth) {
          const links: string[] = [];
          $("a[href]").each((_i, el) => {
            const href = $(el).attr("href");
            if (!href) return;
            try {
              const resolved = new URL(href, url).toString();
              if (this.shouldFollowUrl(resolved, topic)) {
                links.push(resolved);
              }
            } catch {
              // Invalid URL, skip
            }
          });
          if (links.length > 0) {
            // Cap fan-out at 50 links per page.
            await context.addRequests(
              links.slice(0, 50).map((link) => ({
                url: link,
                userData: { depth: depth + 1, topicId: topic.id },
              })),
            );
          }
        }
      },
      failedRequestHandler: ({ request }, error) => {
        job.pagesFailed++;
        callbacks?.onError?.(error as Error, request.url);
      },
    });

    this.activeCrawlers.set(topic.id, crawler);
    try {
      await crawler.run(
        topic.seedUrls.map((url) => ({
          url,
          userData: { depth: 0, topicId: topic.id },
        })),
      );
      job.status = "completed";
      job.completedAt = new Date();
    } catch (err) {
      job.status = "failed";
      job.error = String(err);
      job.completedAt = new Date();
    } finally {
      this.activeCrawlers.delete(topic.id);
      callbacks?.onJobComplete?.(job);
    }
    return job;
  }

  /**
   * Decide whether a discovered link should be enqueued for `topic`:
   * HTTP(S) only, skips common non-content paths and binary/media file
   * extensions, then applies the topic's allow/block domain lists
   * (exact host or any subdomain).
   */
  private shouldFollowUrl(url: string, topic: TopicConfig): boolean {
    try {
      const parsed = new URL(url);
      // Skip non-http(s) schemes ("http" prefix matches "http:" and "https:").
      if (!parsed.protocol.startsWith("http")) return false;
      // Skip common non-content paths
      const skip = ["/login", "/signup", "/register", "/cart", "/checkout", "/api/"];
      if (skip.some((s) => parsed.pathname.startsWith(s))) return false;
      // Skip file extensions we don't want
      const skipExts = [".pdf", ".zip", ".tar", ".gz", ".jpg", ".png", ".gif", ".mp4", ".mp3"];
      if (skipExts.some((ext) => parsed.pathname.endsWith(ext))) return false;
      // Domain restrictions
      if (topic.allowedDomains.length > 0) {
        const allowed = topic.allowedDomains.some(
          (d) => parsed.hostname === d || parsed.hostname.endsWith(`.${d}`),
        );
        if (!allowed) return false;
      }
      if (topic.blockedDomains.length > 0) {
        const blocked = topic.blockedDomains.some(
          (d) => parsed.hostname === d || parsed.hostname.endsWith(`.${d}`),
        );
        if (blocked) return false;
      }
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Forget a topic's crawler. NOTE(review): this only drops our reference;
   * an in-flight Crawlee run is NOT aborted -- confirm whether a
   * crawler stop/teardown call should happen here.
   */
  cancelCrawl(topicId: string): boolean {
    const crawler = this.activeCrawlers.get(topicId);
    if (!crawler) return false;
    this.activeCrawlers.delete(topicId);
    return true;
  }

  /** Topic ids that currently have a crawl in flight. */
  get activeTopics(): string[] {
    return [...this.activeCrawlers.keys()];
  }
}

View File

@ -0,0 +1,108 @@
/**
* @file extractor
* @description Content extraction using Readability + Turndown for clean text/markdown
* @layer Indexer
*
* Takes raw HTML and extracts readable article content,
* then optionally converts to Markdown for AI/search indexing.
*/
import { createHash } from "node:crypto";

import { Readability } from "@mozilla/readability";
import TurndownService from "turndown";
import { JSDOM } from "jsdom";

import type { CrawlResult } from "@webproxy/shared";
/**
 * Readable content pulled out of a crawled HTML page.
 * Produced by ContentExtractor.extract: a Readability article when one is
 * found, otherwise a crude body-text fallback.
 */
export type ExtractedContent = {
  title: string;
  // Article HTML (Readability's cleaned markup, or body innerHTML in the
  // fallback path).
  content: string;
  // Plain text of the article (truncated to 5000 chars in the fallback path).
  textContent: string;
  // Markdown rendering of `content` via Turndown.
  markdown: string;
  excerpt: string;
  byline: string | null;
  // Length reported by Readability; textContent length in the fallback path.
  length: number;
  siteName: string | null;
};
export class ContentExtractor {
  private turndown: TurndownService;

  constructor() {
    this.turndown = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      bulletListMarker: "-",
    });
    // Remove script, style, nav elements from the markdown output.
    this.turndown.remove(["script", "style", "nav", "footer", "header"]);
  }

  /**
   * Extract readable article content from raw HTML.
   * Uses Readability; when it finds no article, falls back to crude
   * body-text extraction. Returns null only when parsing throws.
   */
  extract(html: string, url: string): ExtractedContent | null {
    try {
      const dom = new JSDOM(html, { url });
      const doc = dom.window.document;
      const reader = new Readability(doc);
      const article = reader.parse();
      if (!article || !article.content) {
        // NOTE(review): Readability may have mutated `dom` by this point and
        // the fallback reads from that same DOM -- confirm this is acceptable.
        return this.fallbackExtract(html, url, dom);
      }
      const markdown = this.turndown.turndown(article.content);
      return {
        title: article.title ?? "",
        content: article.content,
        textContent: article.textContent ?? "",
        markdown,
        excerpt: article.excerpt ?? "",
        byline: article.byline ?? null,
        length: article.length ?? 0,
        siteName: article.siteName ?? null,
      };
    } catch {
      return null;
    }
  }

  /** Crude extraction used when Readability finds no article. */
  private fallbackExtract(html: string, url: string, dom: JSDOM): ExtractedContent {
    const doc = dom.window.document;
    const title = doc.title || new URL(url).hostname;
    const body = doc.body?.textContent ?? "";
    // Cap fallback text at 5000 characters.
    const truncated = body.slice(0, 5000).trim();
    return {
      title,
      content: doc.body?.innerHTML ?? "",
      textContent: truncated,
      markdown: this.turndown.turndown(doc.body?.innerHTML ?? ""),
      excerpt: truncated.slice(0, 200),
      byline: null,
      length: truncated.length,
      siteName: null,
    };
  }

  /** Build the document shape stored in the search index. */
  toSearchDocument(url: string, extracted: ExtractedContent, topicId?: string) {
    const domain = new URL(url).hostname;
    return {
      id: this.urlToId(url),
      url,
      title: extracted.title,
      content: extracted.textContent.slice(0, 50_000), // Limit for search indexing
      excerpt: extracted.excerpt,
      markdown: extracted.markdown.slice(0, 50_000),
      domain,
      topicId: topicId ?? null,
      fetchedAt: new Date().toISOString(),
      byline: extracted.byline,
      siteName: extracted.siteName,
    };
  }

  /**
   * Stable 24-hex-char id for a URL (SHA-256 prefix). Must match the id
   * derivation in SearchClient.deleteByUrl.
   */
  private urlToId(url: string): string {
    // Uses the top-level node:crypto import: `require` is not defined in
    // ES modules and would throw ReferenceError at runtime here.
    return createHash("sha256").update(url).digest("hex").slice(0, 24);
  }
}

View File

@ -1,7 +1,10 @@
/** /**
* @file index * @file index
* @description Web crawling and indexing engine * @description Web crawling, content extraction, and search indexing engine
* @layer Indexer * @layer Indexer
*/ */
export { type TopicConfig } from "@webproxy/shared"; export { TopicCrawler } from "./crawler.js";
export { ContentExtractor } from "./extractor.js";
export { SearchClient } from "./search-client.js";
export { CrawlScheduler } from "./scheduler.js";

View File

@ -0,0 +1,184 @@
/**
* @file scheduler
* @description Crawl scheduler that runs topic crawls on configured intervals
* @layer Indexer
*
* Manages scheduled crawl jobs for configured topics.
* Runs crawls at the interval specified in each topic's schedule.
*/
import { type TopicConfig, type CrawlJob } from "@webproxy/shared";
import { TopicCrawler, type CrawlCallbacks } from "./crawler.js";
/** Per-topic bookkeeping held by the scheduler. */
type ScheduledTopic = {
  topic: TopicConfig;
  // Interval handle while scheduled; null when the scheduler is stopped.
  timer: ReturnType<typeof setInterval> | null;
  lastRun: Date | null;
  lastJob: CrawlJob | null;
  // True while a crawl for this topic is in flight (prevents overlap).
  running: boolean;
};
export class CrawlScheduler {
  private crawler: TopicCrawler;
  private topics = new Map<string, ScheduledTopic>();
  private callbacks?: CrawlCallbacks;
  // Whether start() has been called; new topics auto-schedule while true.
  private running = false;

  constructor(crawler: TopicCrawler, callbacks?: CrawlCallbacks) {
    this.crawler = crawler;
    this.callbacks = callbacks;
  }

  /**
   * Register (or replace) a topic. If the scheduler is running and the
   * topic is enabled, it is scheduled immediately.
   */
  addTopic(topic: TopicConfig): void {
    if (this.topics.has(topic.id)) {
      this.removeTopic(topic.id);
    }
    this.topics.set(topic.id, {
      topic,
      timer: null,
      lastRun: null,
      lastJob: null,
      running: false,
    });
    if (this.running && topic.enabled) {
      this.scheduleOne(topic.id);
    }
  }

  /** Unschedule and forget a topic. */
  removeTopic(topicId: string): void {
    const scheduled = this.topics.get(topicId);
    if (scheduled?.timer) {
      clearInterval(scheduled.timer);
    }
    this.topics.delete(topicId);
  }

  /**
   * Begin scheduling every enabled topic. Each topic crawls once
   * immediately and then repeats on its configured interval.
   */
  start(): void {
    this.running = true;
    for (const [id, scheduled] of this.topics) {
      if (scheduled.topic.enabled) {
        this.scheduleOne(id);
      }
    }
    console.log(`[Scheduler] Started with ${this.topics.size} topics`);
  }

  /** Cancel all timers. In-flight crawls are not interrupted. */
  stop(): void {
    this.running = false;
    for (const [, scheduled] of this.topics) {
      if (scheduled.timer) {
        clearInterval(scheduled.timer);
        scheduled.timer = null;
      }
    }
    console.log("[Scheduler] Stopped");
  }

  /** Trigger a crawl now; null when the topic is unknown or already busy. */
  async runNow(topicId: string): Promise<CrawlJob | null> {
    const scheduled = this.topics.get(topicId);
    if (!scheduled) return null;
    return this.executeCrawl(scheduled);
  }

  /** Sequentially crawl every enabled, idle topic. */
  async runAll(): Promise<CrawlJob[]> {
    const jobs: CrawlJob[] = [];
    for (const [, scheduled] of this.topics) {
      if (scheduled.topic.enabled && !scheduled.running) {
        const job = await this.executeCrawl(scheduled);
        if (job) jobs.push(job);
      }
    }
    return jobs;
  }

  // Kick off an immediate crawl and install the repeating interval timer.
  private scheduleOne(topicId: string): void {
    const scheduled = this.topics.get(topicId);
    if (!scheduled) return;
    const intervalMs = scheduled.topic.schedule.intervalMinutes * 60 * 1000;
    // Run immediately on first schedule
    this.executeCrawl(scheduled).catch((err) => {
      console.error(`[Scheduler] Initial crawl failed for ${topicId}:`, err);
    });
    // Then on interval (ticks are skipped while a crawl is still running)
    scheduled.timer = setInterval(() => {
      if (!scheduled.running) {
        this.executeCrawl(scheduled).catch((err) => {
          console.error(`[Scheduler] Scheduled crawl failed for ${topicId}:`, err);
        });
      }
    }, intervalMs);
  }

  /**
   * Run one crawl for a topic, guarding against overlap.
   *
   * The `running` flag is cleared in `finally`: previously it was reset only
   * inside the onJobComplete callback (or the catch branch), so any path
   * where crawlTopic resolved without firing the callback left the topic
   * stuck as "running" forever and it was never crawled again.
   */
  private async executeCrawl(scheduled: ScheduledTopic): Promise<CrawlJob | null> {
    if (scheduled.running) return null;
    scheduled.running = true;
    scheduled.lastRun = new Date();
    console.log(`[Scheduler] Starting crawl: ${scheduled.topic.name}`);
    try {
      const job = await this.crawler.crawlTopic(scheduled.topic, {
        ...this.callbacks,
        onJobComplete: (job) => {
          scheduled.lastJob = job;
          this.callbacks?.onJobComplete?.(job);
          console.log(
            `[Scheduler] Crawl complete: ${scheduled.topic.name} - ${job.pagesProcessed} pages, ${job.pagesFailed} failed`,
          );
        },
      });
      // Record the result even if the callback did not fire.
      scheduled.lastJob = job;
      return job;
    } catch (err) {
      console.error(`[Scheduler] Crawl error: ${scheduled.topic.name}`, err);
      return null;
    } finally {
      scheduled.running = false;
    }
  }

  /** Snapshot of scheduler + per-topic state (used by the admin API). */
  getStatus() {
    const topicStatuses = [];
    for (const [id, scheduled] of this.topics) {
      topicStatuses.push({
        id,
        name: scheduled.topic.name,
        enabled: scheduled.topic.enabled,
        running: scheduled.running,
        lastRun: scheduled.lastRun?.toISOString() ?? null,
        lastJob: scheduled.lastJob
          ? {
              status: scheduled.lastJob.status,
              pagesProcessed: scheduled.lastJob.pagesProcessed,
              pagesFailed: scheduled.lastJob.pagesFailed,
            }
          : null,
        // Projected next tick: last run plus one interval (only meaningful
        // while a timer is installed).
        nextRun: scheduled.timer && scheduled.lastRun
          ? new Date(
              scheduled.lastRun.getTime() +
                scheduled.topic.schedule.intervalMinutes * 60 * 1000,
            ).toISOString()
          : null,
      });
    }
    return {
      running: this.running,
      topics: topicStatuses,
    };
  }
}

View File

@ -0,0 +1,141 @@
/**
* @file search-client
* @description MeiliSearch client for full-text search of indexed content
* @layer Indexer
*
* Wraps the MeiliSearch client for indexing crawled content
* and searching across the local cache.
*/
import { createHash } from "node:crypto";

import { MeiliSearch, type Index } from "meilisearch";

import type { SearchQuery, SearchResult } from "@webproxy/shared";
/**
 * Shape of a document stored in the MeiliSearch index.
 * Built by ContentExtractor.toSearchDocument; `id` is a 24-hex-char
 * SHA-256 prefix of the URL.
 */
export type SearchDocument = {
  id: string;
  url: string;
  title: string;
  // Plain-text body (truncated by the producer to bound index size).
  content: string;
  excerpt: string;
  markdown: string;
  domain: string;
  topicId: string | null;
  // ISO-8601 timestamp of when the page was indexed.
  fetchedAt: string;
  byline: string | null;
  siteName: string | null;
};
export class SearchClient {
  private client: MeiliSearch;
  private indexName: string;
  private index: Index<SearchDocument> | null = null;
  // False until init() succeeds; every operation is a no-op while false.
  private available = false;

  constructor(options: { host: string; apiKey: string; indexName: string }) {
    this.client = new MeiliSearch({
      host: options.host,
      // Pass undefined (not "") when no API key is configured.
      apiKey: options.apiKey || undefined,
    });
    this.indexName = options.indexName;
  }

  /**
   * Check server health and configure index settings.
   * Returns false (leaving the client in no-op mode) when MeiliSearch is
   * unreachable.
   */
  async init(): Promise<boolean> {
    try {
      await this.client.health();
      this.available = true;
      this.index = this.client.index<SearchDocument>(this.indexName);
      // Configure searchable and filterable attributes
      await this.index.updateSettings({
        searchableAttributes: ["title", "content", "excerpt", "url", "domain"],
        filterableAttributes: ["domain", "topicId", "fetchedAt"],
        sortableAttributes: ["fetchedAt"],
        // "content" must be displayed for search() below to populate
        // SearchResult.content; previously it was omitted, so hits always
        // came back with an empty content field.
        displayedAttributes: [
          "id",
          "url",
          "title",
          "excerpt",
          "content",
          "domain",
          "topicId",
          "fetchedAt",
        ],
      });
      return true;
    } catch {
      this.available = false;
      return false;
    }
  }

  /** Index a batch of documents; errors are logged, not thrown. */
  async addDocuments(documents: SearchDocument[]): Promise<void> {
    if (!this.available || !this.index) return;
    try {
      await this.index.addDocuments(documents);
    } catch (err) {
      console.error("[SearchClient] Failed to add documents:", err);
    }
  }

  /** Convenience wrapper for indexing a single document. */
  async addDocument(document: SearchDocument): Promise<void> {
    return this.addDocuments([document]);
  }

  /**
   * Full-text search. Applies topicId/domain filters when present.
   * NOTE(review): query.dateFrom/dateTo are declared on SearchQuery but
   * not applied here.
   */
  async search(query: SearchQuery): Promise<SearchResult[]> {
    if (!this.available || !this.index) {
      return [];
    }
    try {
      const filters: string[] = [];
      // Escape values so quotes/backslashes in input cannot break (or
      // alter) the MeiliSearch filter expression.
      if (query.topicId) filters.push(`topicId = ${this.quoteFilterValue(query.topicId)}`);
      if (query.domain) filters.push(`domain = ${this.quoteFilterValue(query.domain)}`);
      const results = await this.index.search(query.query, {
        limit: query.limit,
        offset: query.offset,
        filter: filters.length > 0 ? filters.join(" AND ") : undefined,
      });
      return results.hits.map((hit) => ({
        url: hit.url,
        title: hit.title,
        excerpt: hit.excerpt,
        content: hit.content ?? "",
        domain: hit.domain,
        topicId: hit.topicId,
        fetchedAt: new Date(hit.fetchedAt),
        score: 0, // MeiliSearch doesn't expose raw scores
      }));
    } catch (err) {
      console.error("[SearchClient] Search failed:", err);
      return [];
    }
  }

  // Quote a string for a MeiliSearch filter expression, escaping
  // backslashes and double quotes.
  private quoteFilterValue(value: string): string {
    return `"${value.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
  }

  /** Remove a document by URL (same id derivation as the indexer). */
  async deleteByUrl(url: string): Promise<void> {
    if (!this.available || !this.index) return;
    // Uses the top-level node:crypto import: `require` is not defined in
    // ES modules and would throw ReferenceError at runtime here.
    const id = createHash("sha256").update(url).digest("hex").slice(0, 24);
    try {
      await this.index.deleteDocument(id);
    } catch {
      // Ignore: deleting a document that is not indexed is not an error.
    }
  }

  /** Document count and indexing state; availability=false on any error. */
  async getStats() {
    if (!this.available || !this.index) {
      return { available: false, documents: 0 };
    }
    try {
      const stats = await this.index.getStats();
      return {
        available: true,
        documents: stats.numberOfDocuments,
        isIndexing: stats.isIndexing,
      };
    } catch {
      return { available: false, documents: 0 };
    }
  }

  get isAvailable(): boolean {
    return this.available;
  }
}

View File

@ -7,5 +7,12 @@
"scripts": { "scripts": {
"typecheck": "tsc --noEmit", "typecheck": "tsc --noEmit",
"clean": "rm -rf dist" "clean": "rm -rf dist"
},
"dependencies": {
"zod": "^4.3.6"
},
"devDependencies": {
"@types/node": "^25.3.2",
"typescript": "^5.9.3"
} }
} }

View File

@ -0,0 +1,23 @@
/**
* @file config
* @description Configuration loader
* @layer Shared
*/
import { readFileSync, existsSync } from "node:fs";
import { resolve } from "node:path";
import { ProxyConfigSchema, type ProxyConfig } from "./types/config.js";
/** Built-in defaults: the schema parsed with every field defaulted. */
export const defaultConfig: ProxyConfig = ProxyConfigSchema.parse({});

/**
 * Load and validate configuration from disk.
 * Reads `webproxy.config.json` in the working directory unless an explicit
 * path is given; returns the built-in defaults when the file is absent.
 */
export function loadConfig(configPath?: string): ProxyConfig {
  const path = configPath ?? resolve(process.cwd(), "webproxy.config.json");
  if (!existsSync(path)) return defaultConfig;
  const parsed = JSON.parse(readFileSync(path, "utf-8"));
  return ProxyConfigSchema.parse(parsed);
}

View File

@ -1,31 +1,26 @@
/** /**
* @file index * @file index
* @description Shared types and utilities for the webproxy monorepo * @description Shared types, schemas, and utilities for the webproxy monorepo
* @layer Shared * @layer Shared
*/ */
export type TopicConfig = { export { TopicConfigSchema, CrawlScheduleSchema } from "./types/topic.js";
id: string; export type { TopicConfig, CrawlSchedule } from "./types/topic.js";
name: string;
keywords: string[];
urls: string[];
crawlInterval: number; // minutes
enabled: boolean;
};
export type CachedPage = { export { CachedPageSchema } from "./types/cache.js";
url: string; export type { CachedPage, CacheStats } from "./types/cache.js";
contentHash: string;
fetchedAt: Date;
expiresAt: Date;
contentType: string;
size: number;
};
export type ProxyConfig = { export { ProxyConfigSchema, ServerConfigSchema } from "./types/config.js";
port: number; export type { ProxyConfig, ServerConfig } from "./types/config.js";
hostname: string;
cacheDir: string; export { CrawlJobSchema } from "./types/crawl.js";
maxCacheSize: number; // bytes export type { CrawlJob, CrawlJobStatus, CrawlResult } from "./types/crawl.js";
topics: TopicConfig[];
}; export { SearchQuerySchema } from "./types/search.js";
export type { SearchQuery, SearchResult } from "./types/search.js";
export type { LogEntry } from "./types/log.js";
export { type LogLevel } from "./types/log.js";
export { createId, formatBytes, normalizeUrl, isValidUrl, sleep } from "./utils.js";
export { loadConfig, defaultConfig } from "./config.js";

View File

@ -0,0 +1,31 @@
/**
* @file cache
* @description Cached page types and schemas
* @layer Shared
*/
import { z } from "zod";
/**
 * Metadata record for one page held in the proxy cache.
 * Dates are coerced so records round-trip through JSON.
 */
export const CachedPageSchema = z.object({
  // z.url() is the Zod v4 top-level format; z.string().url() is deprecated
  // in v4.
  url: z.url(),
  contentHash: z.string(),
  fetchedAt: z.coerce.date(),
  expiresAt: z.coerce.date(),
  contentType: z.string(),
  statusCode: z.number(),
  headers: z.record(z.string(), z.string()).default(() => ({})),
  size: z.number(),
  // WARC file the archived response lives in, when archived.
  warcFile: z.string().optional(),
  topicId: z.string().optional(),
});
export type CachedPage = z.infer<typeof CachedPageSchema>;
/** Aggregate statistics for the proxy cache. */
export type CacheStats = {
  totalPages: number;
  // Total cached bytes.
  totalSize: number;
  oldestEntry: Date | null;
  newestEntry: Date | null;
  // NOTE(review): presumably fractions in [0, 1] with hitRate + missRate
  // summing to 1 -- the producer (CacheStore) is not visible here; confirm.
  hitRate: number;
  missRate: number;
};

View File

@ -0,0 +1,73 @@
/**
* @file config
* @description Application configuration types and schemas
* @layer Shared
*/
import { z } from "zod";
import { TopicConfigSchema } from "./topic.js";
/** Listener configuration for the proxy and its admin API. */
export const ServerConfigSchema = z.object({
  host: z.string().default("0.0.0.0"),
  // Port the forward proxy listens on.
  port: z.number().min(1).max(65535).default(8080),
  // Port for the admin/status HTTP API.
  adminPort: z.number().min(1).max(65535).default(8081),
});
export type ServerConfig = z.infer<typeof ServerConfigSchema>;
/**
 * Root application configuration.
 *
 * Every section carries a whole-object `.default(...)` so that a missing
 * section (or an entirely empty config file) still parses.
 * NOTE(review): Zod `.default()` values are used as-is rather than re-parsed,
 * so these literal defaults must be kept in sync by hand with the per-field
 * defaults declared just above each of them.
 */
export const ProxyConfigSchema = z.object({
  server: ServerConfigSchema.default(() => ({
    host: "0.0.0.0",
    port: 8080,
    adminPort: 8081,
  })),
  // On-disk response cache for the proxy.
  cache: z.object({
    dir: z.string().default("./data/cache"),
    // 10 GiB.
    maxSizeBytes: z.number().default(10 * 1024 * 1024 * 1024),
    // Entry max age: 24 h, in milliseconds.
    maxAge: z.number().default(24 * 60 * 60 * 1000),
    cleanupIntervalMs: z.number().default(60 * 60 * 1000),
  }).default(() => ({
    dir: "./data/cache",
    maxSizeBytes: 10 * 1024 * 1024 * 1024,
    maxAge: 24 * 60 * 60 * 1000,
    cleanupIntervalMs: 60 * 60 * 1000,
  })),
  // WARC archiving of proxied responses.
  warc: z.object({
    dir: z.string().default("./data/warc"),
    // Rotate files at 1 GiB.
    maxFileSize: z.number().default(1024 * 1024 * 1024),
    compress: z.boolean().default(true),
  }).default(() => ({
    dir: "./data/warc",
    maxFileSize: 1024 * 1024 * 1024,
    compress: true,
  })),
  // MeiliSearch connection settings.
  search: z.object({
    host: z.string().default("http://localhost:7700"),
    apiKey: z.string().default(""),
    indexName: z.string().default("webproxy-pages"),
  }).default(() => ({
    host: "http://localhost:7700",
    apiKey: "",
    indexName: "webproxy-pages",
  })),
  topics: z.array(TopicConfigSchema).default(() => []),
  // Forward-proxy behavior.
  proxy: z.object({
    // Milliseconds, per the 30000 default.
    timeout: z.number().default(30000),
    followRedirects: z.boolean().default(true),
    maxRedirects: z.number().default(5),
    // Upstream ports the proxy may connect to (consumer not visible here).
    allowedPorts: z.array(z.number()).default(() => [80, 443, 8080, 8443]),
  }).default(() => ({
    timeout: 30000,
    followRedirects: true,
    maxRedirects: 5,
    allowedPorts: [80, 443, 8080, 8443],
  })),
  logging: z.object({
    level: z.enum(["debug", "info", "warn", "error"]).default("info"),
    // Optional log file path.
    file: z.string().optional(),
  }).default(() => ({
    level: "info" as const,
  })),
});
export type ProxyConfig = z.infer<typeof ProxyConfigSchema>;

View File

@ -0,0 +1,37 @@
/**
* @file crawl
* @description Crawl job types and schemas
* @layer Shared
*/
import { z } from "zod";
/**
 * Lifecycle states for a crawl job, declared once so the TypeScript union
 * and the Zod enum cannot drift apart (they were previously two separate
 * hand-maintained copies of the same list).
 */
const CRAWL_JOB_STATUSES = ["pending", "running", "completed", "failed", "cancelled"] as const;

export type CrawlJobStatus = (typeof CRAWL_JOB_STATUSES)[number];

/** Progress/result record for one crawl of one topic. */
export const CrawlJobSchema = z.object({
  id: z.string(),
  topicId: z.string(),
  status: z.enum(CRAWL_JOB_STATUSES).default("pending"),
  startedAt: z.coerce.date().optional(),
  completedAt: z.coerce.date().optional(),
  pagesProcessed: z.number().default(0),
  pagesFailed: z.number().default(0),
  bytesDownloaded: z.number().default(0),
  // Human-readable failure reason when status is "failed".
  error: z.string().optional(),
});
export type CrawlJob = z.infer<typeof CrawlJobSchema>;
/** Raw + extracted data for one fetched page (plain type, no Zod schema). */
export type CrawlResult = {
  url: string;
  statusCode: number;
  contentType: string;
  // Plain-text content: extracted article text, or body text as fallback.
  content: string;
  rawHtml: string;
  title: string;
  excerpt: string;
  byline: string | null;
  fetchedAt: Date;
  headers: Record<string, string>;
  // Byte length of rawHtml (UTF-8).
  size: number;
};

View File

@ -0,0 +1,15 @@
/**
* @file log
* @description Logging types
* @layer Shared
*/
/** Severity levels, matching the logging configuration's enum. */
export type LogLevel = "debug" | "info" | "warn" | "error";

/** A single structured log record. */
export type LogEntry = {
  level: LogLevel;
  message: string;
  timestamp: Date;
  // Component/module name that produced the entry.
  context?: string;
  // Arbitrary structured payload.
  data?: Record<string, unknown>;
};

View File

@ -0,0 +1,30 @@
/**
* @file search
* @description Search types and schemas
* @layer Shared
*/
import { z } from "zod";
/**
 * Parameters for a full-text search request.
 * NOTE(review): dateFrom/dateTo are declared here but SearchClient.search
 * only applies the topicId/domain filters -- date filtering is not wired up.
 */
export const SearchQuerySchema = z.object({
  query: z.string().min(1),
  // Page size, capped at 100.
  limit: z.number().min(1).max(100).default(20),
  offset: z.number().min(0).default(0),
  topicId: z.string().optional(),
  domain: z.string().optional(),
  dateFrom: z.coerce.date().optional(),
  dateTo: z.coerce.date().optional(),
});
export type SearchQuery = z.infer<typeof SearchQuerySchema>;
/** One hit returned from the search index. */
export type SearchResult = {
  url: string;
  title: string;
  excerpt: string;
  content: string;
  domain: string;
  topicId: string | null;
  fetchedAt: Date;
  // Relevance score. NOTE(review): SearchClient currently always sets 0
  // because MeiliSearch does not expose raw scores.
  score: number;
};

View File

@ -0,0 +1,38 @@
/**
* @file topic
* @description Topic configuration types and schemas
* @layer Shared
*/
import { z } from "zod";
/** How often and how deeply a topic is crawled. */
export const CrawlScheduleSchema = z.object({
  // Minimum one minute between crawls.
  intervalMinutes: z.number().min(1).default(60),
  maxPagesPerCrawl: z.number().min(1).default(100),
  // Link-follow depth; 0 crawls only the seed URLs.
  maxDepth: z.number().min(0).default(3),
  // NOTE(review): declared here, but no robots.txt handling is visible in
  // the crawler in this chunk -- confirm it is actually enforced.
  respectRobotsTxt: z.boolean().default(true),
  userAgent: z.string().default("WebProxy/0.1"),
});
export type CrawlSchedule = z.infer<typeof CrawlScheduleSchema>;
/**
 * Configuration for one crawl topic: what to crawl (seed URLs), where
 * crawling may go (domain allow/block lists), and on what schedule.
 */
export const TopicConfigSchema = z.object({
  id: z.string(),
  name: z.string().min(1),
  keywords: z.array(z.string()).default(() => []),
  // z.url() is the Zod v4 top-level format; z.string().url() is deprecated
  // in v4.
  seedUrls: z.array(z.url()).default(() => []),
  // Matched as exact host or any subdomain of the listed entries.
  allowedDomains: z.array(z.string()).default(() => []),
  blockedDomains: z.array(z.string()).default(() => []),
  // Build the default from the schedule schema itself so this default can
  // never drift from CrawlScheduleSchema's per-field defaults (previously a
  // hand-maintained duplicate literal).
  schedule: CrawlScheduleSchema.default(() => CrawlScheduleSchema.parse({})),
  enabled: z.boolean().default(true),
  createdAt: z.coerce.date().default(() => new Date()),
  updatedAt: z.coerce.date().default(() => new Date()),
});
export type TopicConfig = z.infer<typeof TopicConfigSchema>;

View File

@ -0,0 +1,57 @@
/**
* @file utils
* @description Shared utility functions
* @layer Shared
*/
import { randomBytes } from "node:crypto";
export function createId(prefix = ""): string {
const id = randomBytes(12).toString("hex");
return prefix ? `${prefix}_${id}` : id;
}
/**
 * Render a byte count as a human-readable string, e.g. 1536 -> "1.5 KB".
 * Non-finite or non-positive inputs render as "0 B" (previously negative
 * input produced "NaN undefined"); the exponent is clamped so fractional
 * inputs (log < 0) and values at or above 1024 TB stay within the unit
 * table instead of indexing units[-1] / units[5].
 */
export function formatBytes(bytes: number): string {
  if (!Number.isFinite(bytes) || bytes <= 0) return "0 B";
  const units = ["B", "KB", "MB", "GB", "TB"];
  const i = Math.min(
    units.length - 1,
    Math.max(0, Math.floor(Math.log(bytes) / Math.log(1024))),
  );
  const value = bytes / Math.pow(1024, i);
  // Whole bytes are shown without a decimal; larger units get one place.
  return `${value.toFixed(i > 0 ? 1 : 0)} ${units[i]}`;
}
/**
 * Canonicalize a URL for use as a cache key: drop the fragment, strip a
 * trailing slash from non-root paths, and sort query parameters.
 * Unparseable input is returned unchanged.
 */
export function normalizeUrl(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  parsed.hash = "";
  const path = parsed.pathname;
  if (path.endsWith("/") && path.length > 1) {
    parsed.pathname = path.slice(0, -1);
  }
  // Deterministic parameter order so equivalent URLs hit the same cache key.
  parsed.searchParams.sort();
  return parsed.toString();
}
/** True when the WHATWG URL parser accepts the string. */
export function isValidUrl(url: string): boolean {
  try {
    new URL(url);
  } catch {
    return false;
  }
  return true;
}
/** Resolve after roughly `ms` milliseconds. */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/** Hostname of a URL, or "" when the string does not parse. */
export function extractDomain(url: string): string {
  try {
    const { hostname } = new URL(url);
    return hostname;
  } catch {
    return "";
  }
}

3016
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

50
webproxy.config.json Normal file
View File

@ -0,0 +1,50 @@
{
"server": {
"host": "0.0.0.0",
"port": 8080,
"adminPort": 8081
},
"cache": {
"dir": "./data/cache",
"maxSizeBytes": 10737418240,
"maxAge": 86400000,
"cleanupIntervalMs": 3600000
},
"warc": {
"dir": "./data/warc",
"maxFileSize": 1073741824,
"compress": false
},
"search": {
"host": "http://localhost:7700",
"apiKey": "",
"indexName": "webproxy-pages"
},
"topics": [
{
"id": "topic_example",
"name": "Example Topic",
"keywords": ["typescript", "node.js"],
"seedUrls": ["https://nodejs.org/en", "https://www.typescriptlang.org/docs"],
"allowedDomains": ["nodejs.org", "typescriptlang.org"],
"blockedDomains": [],
"schedule": {
"intervalMinutes": 360,
"maxPagesPerCrawl": 50,
"maxDepth": 2,
"respectRobotsTxt": true,
"userAgent": "WebProxy/0.1"
},
"enabled": false
}
],
"proxy": {
"timeout": 30000,
"followRedirects": true,
"maxRedirects": 5,
"allowedPorts": [80, 443, 8080, 8443]
},
"logging": {
"level": "info"
}
}