feat: implement core proxy server, crawler, and indexer modules
packages/shared: - Zod v4 schemas for TopicConfig, ProxyConfig, CrawlJob, SearchQuery - Config loader with defaults - Utility functions (createId, formatBytes, normalizeUrl) packages/core: - WebProxyServer: HTTP forward proxy using http-proxy-3 - CacheStore: LRU-based in-memory + disk cache for proxied responses - WarcWriter: WARC file archiving for all proxied content - HTTPS CONNECT tunneling for SSL passthrough - Admin API with /api/status, /api/cache/stats, /api/config packages/indexer: - TopicCrawler: Crawlee CheerioCrawler for topic-based web crawling - ContentExtractor: @mozilla/readability + turndown for clean text/markdown - SearchClient: MeiliSearch integration for full-text search - CrawlScheduler: Interval-based crawl job scheduling apps/proxy: - Main entry point orchestrating all components - Graceful shutdown handling - Proxy-only mode when no topics configured All packages type-check clean. Next.js build passes. Co-Authored-By: UnicornDev <noreply@unicorndev.wtf>
This commit is contained in:
parent
a2bae5c98e
commit
9386d4a7b3
@ -13,6 +13,12 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@webproxy/core": "workspace:*",
|
"@webproxy/core": "workspace:*",
|
||||||
"@webproxy/indexer": "workspace:*",
|
"@webproxy/indexer": "workspace:*",
|
||||||
"@webproxy/shared": "workspace:*"
|
"@webproxy/shared": "workspace:*",
|
||||||
|
"zod": "^4.3.6"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^20.19.35",
|
||||||
|
"tsx": "^4.21.0",
|
||||||
|
"typescript": "^5.9.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,11 +1,154 @@
|
|||||||
/**
|
/**
|
||||||
* @file index
|
* @file index
|
||||||
* @description WebProxy server entry point
|
* @description WebProxy main entry point - starts proxy, crawler, and admin API
|
||||||
* @layer Application
|
* @layer Application
|
||||||
*
|
*
|
||||||
* Starts the proxy server, indexer, and serves cached content to network devices.
|
* Orchestrates all components: HTTP proxy server, crawl scheduler,
|
||||||
* This is a placeholder — full implementation coming in Phase 2.
|
* search client, and admin API. Handles graceful shutdown.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
console.log("WebProxy v0.1.0 — proxy server placeholder");
|
import { loadConfig, formatBytes, type ProxyConfig } from "@webproxy/shared";
|
||||||
console.log("Run `pnpm dev` in apps/web for the landing page");
|
import { WebProxyServer, Logger } from "@webproxy/core";
|
||||||
|
import { TopicCrawler, CrawlScheduler, SearchClient, ContentExtractor } from "@webproxy/indexer";
|
||||||
|
|
||||||
|
const log = new Logger("WebProxy");
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
log.info("WebProxy v0.1.0 starting...");
|
||||||
|
|
||||||
|
// Load config
|
||||||
|
const configPath = process.argv[2] ?? undefined;
|
||||||
|
let config: ProxyConfig;
|
||||||
|
|
||||||
|
try {
|
||||||
|
config = loadConfig(configPath);
|
||||||
|
log.info("Configuration loaded", {
|
||||||
|
proxy: `${config.server.host}:${config.server.port}`,
|
||||||
|
admin: `${config.server.host}:${config.server.adminPort}`,
|
||||||
|
topics: config.topics.length,
|
||||||
|
cacheDir: config.cache.dir,
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
log.error("Failed to load config", { error: String(err) });
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize search client (optional - works without MeiliSearch)
|
||||||
|
const search = new SearchClient({
|
||||||
|
host: config.search.host,
|
||||||
|
apiKey: config.search.apiKey,
|
||||||
|
indexName: config.search.indexName,
|
||||||
|
});
|
||||||
|
|
||||||
|
const searchAvailable = await search.init();
|
||||||
|
if (searchAvailable) {
|
||||||
|
log.info("MeiliSearch connected", { host: config.search.host });
|
||||||
|
} else {
|
||||||
|
log.warn("MeiliSearch not available - search will be disabled");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize proxy server
|
||||||
|
const proxy = new WebProxyServer(config);
|
||||||
|
|
||||||
|
// Set up event logging
|
||||||
|
proxy.on("request", (event) => {
|
||||||
|
const cached = event.cached ? " [CACHED]" : "";
|
||||||
|
log.info(
|
||||||
|
`${event.method} ${event.statusCode} ${event.url} ${formatBytes(event.size)} ${event.duration}ms${cached}`,
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
proxy.on("cacheHit", (url, size) => {
|
||||||
|
log.debug(`Cache HIT: ${url} (${formatBytes(size)})`);
|
||||||
|
});
|
||||||
|
|
||||||
|
proxy.on("cacheStore", (url, size) => {
|
||||||
|
log.debug(`Cached: ${url} (${formatBytes(size)})`);
|
||||||
|
});
|
||||||
|
|
||||||
|
proxy.on("error", (err, url) => {
|
||||||
|
log.error(`Error: ${url ?? "unknown"}`, { error: err.message });
|
||||||
|
});
|
||||||
|
|
||||||
|
// Initialize crawler and scheduler
|
||||||
|
const crawler = new TopicCrawler({ search });
|
||||||
|
const extractor = new ContentExtractor();
|
||||||
|
|
||||||
|
const scheduler = new CrawlScheduler(crawler, {
|
||||||
|
onPageCrawled: async (result, extracted) => {
|
||||||
|
// Also store in proxy cache for serving to network devices
|
||||||
|
proxy.cacheStore.set(result.url, {
|
||||||
|
url: result.url,
|
||||||
|
fetchedAt: result.fetchedAt,
|
||||||
|
expiresAt: new Date(Date.now() + config.cache.maxAge),
|
||||||
|
contentType: result.contentType,
|
||||||
|
statusCode: result.statusCode,
|
||||||
|
headers: result.headers,
|
||||||
|
size: result.size,
|
||||||
|
}, Buffer.from(result.rawHtml, "utf-8"));
|
||||||
|
|
||||||
|
// Write to WARC archive
|
||||||
|
await proxy.warcWriter.write({
|
||||||
|
url: result.url,
|
||||||
|
method: "GET",
|
||||||
|
requestHeaders: { "User-Agent": "WebProxy/0.1" },
|
||||||
|
statusCode: result.statusCode,
|
||||||
|
responseHeaders: result.headers,
|
||||||
|
body: Buffer.from(result.rawHtml, "utf-8"),
|
||||||
|
timestamp: result.fetchedAt,
|
||||||
|
});
|
||||||
|
},
|
||||||
|
onJobComplete: (job) => {
|
||||||
|
log.info(`Crawl job ${job.id} completed`, {
|
||||||
|
topic: job.topicId,
|
||||||
|
pages: job.pagesProcessed,
|
||||||
|
failed: job.pagesFailed,
|
||||||
|
bytes: formatBytes(job.bytesDownloaded),
|
||||||
|
});
|
||||||
|
},
|
||||||
|
onError: (err, url) => {
|
||||||
|
log.warn(`Crawl error: ${url}`, { error: err.message });
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Add configured topics to scheduler
|
||||||
|
for (const topic of config.topics) {
|
||||||
|
scheduler.addTopic(topic);
|
||||||
|
log.info(`Topic registered: ${topic.name}`, {
|
||||||
|
urls: topic.seedUrls.length,
|
||||||
|
interval: `${topic.schedule.intervalMinutes}m`,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start everything
|
||||||
|
await proxy.start();
|
||||||
|
|
||||||
|
if (config.topics.length > 0) {
|
||||||
|
scheduler.start();
|
||||||
|
log.info(`Scheduler started with ${config.topics.length} topics`);
|
||||||
|
} else {
|
||||||
|
log.info("No topics configured - proxy-only mode (add topics in config)");
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("WebProxy is running");
|
||||||
|
log.info(` Proxy: http://${config.server.host}:${config.server.port}`);
|
||||||
|
log.info(` Admin: http://${config.server.host}:${config.server.adminPort}/api/status`);
|
||||||
|
log.info(` Cache: ${config.cache.dir}`);
|
||||||
|
log.info(` WARC: ${config.warc.dir}`);
|
||||||
|
|
||||||
|
// Graceful shutdown
|
||||||
|
const shutdown = async (signal: string) => {
|
||||||
|
log.info(`Received ${signal}, shutting down...`);
|
||||||
|
scheduler.stop();
|
||||||
|
await proxy.stop();
|
||||||
|
process.exit(0);
|
||||||
|
};
|
||||||
|
|
||||||
|
process.on("SIGINT", () => shutdown("SIGINT"));
|
||||||
|
process.on("SIGTERM", () => shutdown("SIGTERM"));
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch((err) => {
|
||||||
|
log.error("Fatal error", { error: String(err) });
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
|||||||
16
package.json
16
package.json
@ -5,17 +5,27 @@
|
|||||||
"description": "Local internet indexing layer - crawl, cache, and serve web content to your network",
|
"description": "Local internet indexing layer - crawl, cache, and serve web content to your network",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "pnpm --filter @webproxy/web dev",
|
"dev": "pnpm --filter @webproxy/web dev",
|
||||||
|
"dev:proxy": "pnpm --filter @webproxy/proxy dev",
|
||||||
"build": "pnpm --filter @webproxy/web build",
|
"build": "pnpm --filter @webproxy/web build",
|
||||||
"start": "pnpm --filter @webproxy/web start",
|
"start": "pnpm --filter @webproxy/web start",
|
||||||
|
"start:proxy": "pnpm --filter @webproxy/proxy start",
|
||||||
"lint": "pnpm -r lint",
|
"lint": "pnpm -r lint",
|
||||||
"clean": "pnpm -r clean",
|
"clean": "pnpm -r clean",
|
||||||
"typecheck": "pnpm -r typecheck"
|
"typecheck": "pnpm -r --no-bail typecheck"
|
||||||
},
|
},
|
||||||
"keywords": ["proxy", "web-indexer", "cache", "local-network"],
|
"keywords": [
|
||||||
|
"proxy",
|
||||||
|
"web-indexer",
|
||||||
|
"cache",
|
||||||
|
"local-network"
|
||||||
|
],
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=20",
|
"node": ">=20",
|
||||||
"pnpm": ">=9"
|
"pnpm": ">=9"
|
||||||
},
|
},
|
||||||
"packageManager": "pnpm@10.24.0"
|
"packageManager": "pnpm@10.24.0",
|
||||||
|
"devDependencies": {
|
||||||
|
"typescript": "^5.9.3"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,10 +5,18 @@
|
|||||||
"main": "./src/index.ts",
|
"main": "./src/index.ts",
|
||||||
"types": "./src/index.ts",
|
"types": "./src/index.ts",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@webproxy/shared": "workspace:*"
|
"@webproxy/shared": "workspace:*",
|
||||||
|
"http-proxy-3": "^1.23.2",
|
||||||
|
"lru-cache": "^11.2.6",
|
||||||
|
"warcio": "^2.4.10",
|
||||||
|
"zod": "^4.3.6"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"typecheck": "tsc --noEmit",
|
"typecheck": "tsc --noEmit",
|
||||||
"clean": "rm -rf dist"
|
"clean": "rm -rf dist"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^25.3.2",
|
||||||
|
"typescript": "^5.9.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
212
packages/core/src/cache-store.ts
Normal file
212
packages/core/src/cache-store.ts
Normal file
@ -0,0 +1,212 @@
|
|||||||
|
/**
|
||||||
|
* @file cache-store
|
||||||
|
* @description In-memory + disk cache for proxied content with LRU eviction
|
||||||
|
* @layer Core
|
||||||
|
*
|
||||||
|
* Manages cached HTTP responses. Uses LRU-cache for hot metadata lookups
|
||||||
|
* and filesystem for response bodies. Tracks cache hit/miss stats.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { LRUCache } from "lru-cache";
|
||||||
|
import { createHash } from "node:crypto";
|
||||||
|
import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync, readdirSync, statSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { normalizeUrl, formatBytes, type CachedPage, type CacheStats } from "@webproxy/shared";
|
||||||
|
import { Logger } from "./logger.js";
|
||||||
|
|
||||||
|
type CacheEntry = {
|
||||||
|
meta: CachedPage;
|
||||||
|
bodyPath: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
export class CacheStore {
|
||||||
|
private index: LRUCache<string, CacheEntry>;
|
||||||
|
private cacheDir: string;
|
||||||
|
private stats = { hits: 0, misses: 0 };
|
||||||
|
private log: Logger;
|
||||||
|
|
||||||
|
constructor(options: {
|
||||||
|
dir: string;
|
||||||
|
maxSizeBytes: number;
|
||||||
|
maxAge: number;
|
||||||
|
logger?: Logger;
|
||||||
|
}) {
|
||||||
|
this.cacheDir = options.dir;
|
||||||
|
this.log = options.logger ?? new Logger("CacheStore");
|
||||||
|
|
||||||
|
if (!existsSync(this.cacheDir)) {
|
||||||
|
mkdirSync(this.cacheDir, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
this.index = new LRUCache<string, CacheEntry>({
|
||||||
|
max: 50_000, // max entries in memory index
|
||||||
|
ttl: options.maxAge,
|
||||||
|
dispose: (_value, key) => {
|
||||||
|
this.deleteFromDisk(key);
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
this.loadIndex();
|
||||||
|
}
|
||||||
|
|
||||||
|
has(url: string): boolean {
|
||||||
|
const key = this.urlToKey(url);
|
||||||
|
const entry = this.index.get(key);
|
||||||
|
if (!entry) return false;
|
||||||
|
|
||||||
|
// Check if expired
|
||||||
|
if (entry.meta.expiresAt < new Date()) {
|
||||||
|
this.index.delete(key);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
get(url: string): { meta: CachedPage; body: Buffer } | null {
|
||||||
|
const key = this.urlToKey(url);
|
||||||
|
const entry = this.index.get(key);
|
||||||
|
|
||||||
|
if (!entry) {
|
||||||
|
this.stats.misses++;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (entry.meta.expiresAt < new Date()) {
|
||||||
|
this.index.delete(key);
|
||||||
|
this.stats.misses++;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const body = readFileSync(entry.bodyPath);
|
||||||
|
this.stats.hits++;
|
||||||
|
return { meta: entry.meta, body };
|
||||||
|
} catch {
|
||||||
|
this.index.delete(key);
|
||||||
|
this.stats.misses++;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
set(url: string, meta: Omit<CachedPage, "contentHash">, body: Buffer): void {
|
||||||
|
const key = this.urlToKey(url);
|
||||||
|
const contentHash = createHash("sha256").update(body).digest("hex");
|
||||||
|
const bodyPath = join(this.cacheDir, `${key}.bin`);
|
||||||
|
|
||||||
|
writeFileSync(bodyPath, body);
|
||||||
|
|
||||||
|
const fullMeta: CachedPage = { ...meta, contentHash };
|
||||||
|
|
||||||
|
this.index.set(key, { meta: fullMeta, bodyPath });
|
||||||
|
this.log.debug(`Cached ${formatBytes(body.length)}`, { url });
|
||||||
|
}
|
||||||
|
|
||||||
|
delete(url: string): boolean {
|
||||||
|
const key = this.urlToKey(url);
|
||||||
|
return this.index.delete(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
getStats(): CacheStats {
|
||||||
|
const entries = [...this.index.values()];
|
||||||
|
const total = this.stats.hits + this.stats.misses;
|
||||||
|
|
||||||
|
let totalSize = 0;
|
||||||
|
let oldest: Date | null = null;
|
||||||
|
let newest: Date | null = null;
|
||||||
|
|
||||||
|
for (const entry of entries) {
|
||||||
|
totalSize += entry.meta.size;
|
||||||
|
if (!oldest || entry.meta.fetchedAt < oldest) oldest = entry.meta.fetchedAt;
|
||||||
|
if (!newest || entry.meta.fetchedAt > newest) newest = entry.meta.fetchedAt;
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
totalPages: this.index.size,
|
||||||
|
totalSize,
|
||||||
|
oldestEntry: oldest,
|
||||||
|
newestEntry: newest,
|
||||||
|
hitRate: total > 0 ? this.stats.hits / total : 0,
|
||||||
|
missRate: total > 0 ? this.stats.misses / total : 0,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
clear(): void {
|
||||||
|
this.index.clear();
|
||||||
|
this.stats = { hits: 0, misses: 0 };
|
||||||
|
this.log.info("Cache cleared");
|
||||||
|
}
|
||||||
|
|
||||||
|
private urlToKey(url: string): string {
|
||||||
|
const normalized = normalizeUrl(url);
|
||||||
|
return createHash("sha256").update(normalized).digest("hex").slice(0, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
private deleteFromDisk(key: string): void {
|
||||||
|
const bodyPath = join(this.cacheDir, `${key}.bin`);
|
||||||
|
try {
|
||||||
|
unlinkSync(bodyPath);
|
||||||
|
} catch {
|
||||||
|
// File may already be gone
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private loadIndex(): void {
|
||||||
|
const metaDir = join(this.cacheDir, "meta");
|
||||||
|
if (!existsSync(metaDir)) {
|
||||||
|
mkdirSync(metaDir, { recursive: true });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const files = readdirSync(metaDir).filter((f) => f.endsWith(".json"));
|
||||||
|
let loaded = 0;
|
||||||
|
|
||||||
|
for (const file of files) {
|
||||||
|
try {
|
||||||
|
const raw = readFileSync(join(metaDir, file), "utf-8");
|
||||||
|
const entry: CacheEntry = JSON.parse(raw);
|
||||||
|
entry.meta.fetchedAt = new Date(entry.meta.fetchedAt);
|
||||||
|
entry.meta.expiresAt = new Date(entry.meta.expiresAt);
|
||||||
|
|
||||||
|
if (entry.meta.expiresAt > new Date() && existsSync(entry.bodyPath)) {
|
||||||
|
const key = file.replace(".json", "");
|
||||||
|
this.index.set(key, entry);
|
||||||
|
loaded++;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Skip corrupted entries
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (loaded > 0) {
|
||||||
|
this.log.info(`Loaded ${loaded} cached entries from disk`);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Fresh cache
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
persistIndex(): void {
|
||||||
|
const metaDir = join(this.cacheDir, "meta");
|
||||||
|
if (!existsSync(metaDir)) {
|
||||||
|
mkdirSync(metaDir, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
let saved = 0;
|
||||||
|
for (const [key, entry] of this.index.entries()) {
|
||||||
|
try {
|
||||||
|
writeFileSync(join(metaDir, `${key}.json`), JSON.stringify(entry));
|
||||||
|
saved++;
|
||||||
|
} catch {
|
||||||
|
// Skip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.log.info(`Persisted ${saved} cache entries`);
|
||||||
|
}
|
||||||
|
|
||||||
|
get size(): number {
|
||||||
|
return this.index.size;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,7 +1,11 @@
|
|||||||
/**
|
/**
|
||||||
* @file index
|
* @file index
|
||||||
* @description Core proxy engine - HTTP handling and caching
|
* @description Core proxy engine - HTTP proxy, caching, WARC storage
|
||||||
* @layer Core
|
* @layer Core
|
||||||
*/
|
*/
|
||||||
|
|
||||||
export { type ProxyConfig, type CachedPage } from "@webproxy/shared";
|
export { WebProxyServer } from "./proxy-server.js";
|
||||||
|
export { CacheStore } from "./cache-store.js";
|
||||||
|
export { WarcWriter } from "./warc-writer.js";
|
||||||
|
export { Logger } from "./logger.js";
|
||||||
|
export type { ProxyEvent, ProxyEventMap } from "./types.js";
|
||||||
|
|||||||
73
packages/core/src/logger.ts
Normal file
73
packages/core/src/logger.ts
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
/**
|
||||||
|
* @file logger
|
||||||
|
* @description Structured logger for the proxy
|
||||||
|
* @layer Core
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { LogLevel } from "@webproxy/shared";
|
||||||
|
|
||||||
|
const LEVEL_PRIORITY: Record<LogLevel, number> = {
|
||||||
|
debug: 0,
|
||||||
|
info: 1,
|
||||||
|
warn: 2,
|
||||||
|
error: 3,
|
||||||
|
};
|
||||||
|
|
||||||
|
const LEVEL_COLOR: Record<LogLevel, string> = {
|
||||||
|
debug: "\x1b[90m",
|
||||||
|
info: "\x1b[36m",
|
||||||
|
warn: "\x1b[33m",
|
||||||
|
error: "\x1b[31m",
|
||||||
|
};
|
||||||
|
|
||||||
|
const RESET = "\x1b[0m";
|
||||||
|
|
||||||
|
export class Logger {
|
||||||
|
private minLevel: number;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
private context: string,
|
||||||
|
level: LogLevel = "info",
|
||||||
|
) {
|
||||||
|
this.minLevel = LEVEL_PRIORITY[level];
|
||||||
|
}
|
||||||
|
|
||||||
|
private log(level: LogLevel, message: string, data?: Record<string, unknown>) {
|
||||||
|
if (LEVEL_PRIORITY[level] < this.minLevel) return;
|
||||||
|
|
||||||
|
const timestamp = new Date().toISOString();
|
||||||
|
const color = LEVEL_COLOR[level];
|
||||||
|
const prefix = `${color}[${timestamp}] [${level.toUpperCase()}] [${this.context}]${RESET}`;
|
||||||
|
|
||||||
|
if (data) {
|
||||||
|
console.log(`${prefix} ${message}`, data);
|
||||||
|
} else {
|
||||||
|
console.log(`${prefix} ${message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
debug(message: string, data?: Record<string, unknown>) {
|
||||||
|
this.log("debug", message, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
info(message: string, data?: Record<string, unknown>) {
|
||||||
|
this.log("info", message, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
warn(message: string, data?: Record<string, unknown>) {
|
||||||
|
this.log("warn", message, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
error(message: string, data?: Record<string, unknown>) {
|
||||||
|
this.log("error", message, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
child(context: string): Logger {
|
||||||
|
return new Logger(`${this.context}:${context}`, this.levelName());
|
||||||
|
}
|
||||||
|
|
||||||
|
private levelName(): LogLevel {
|
||||||
|
const entry = Object.entries(LEVEL_PRIORITY).find(([, v]) => v === this.minLevel);
|
||||||
|
return (entry?.[0] as LogLevel) ?? "info";
|
||||||
|
}
|
||||||
|
}
|
||||||
375
packages/core/src/proxy-server.ts
Normal file
375
packages/core/src/proxy-server.ts
Normal file
@ -0,0 +1,375 @@
|
|||||||
|
/**
|
||||||
|
* @file proxy-server
|
||||||
|
* @description Forward HTTP proxy server using http-proxy-3
|
||||||
|
* @layer Core
|
||||||
|
*
|
||||||
|
* Acts as a forward proxy for network devices. Intercepts HTTP requests,
|
||||||
|
* checks local cache, serves cached content or fetches from upstream.
|
||||||
|
* Archives responses to WARC and indexes content for search.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { createServer, Server, IncomingMessage, ServerResponse } from "node:http";
|
||||||
|
import { createProxyServer, type ProxyServer as HttpProxy } from "http-proxy-3";
|
||||||
|
import { EventEmitter } from "node:events";
|
||||||
|
import { createHash } from "node:crypto";
|
||||||
|
import { connect as netConnect, type Socket } from "node:net";
|
||||||
|
import { type ProxyConfig, formatBytes } from "@webproxy/shared";
|
||||||
|
import { CacheStore } from "./cache-store.js";
|
||||||
|
import { WarcWriter, type WarcRecord } from "./warc-writer.js";
|
||||||
|
import { Logger } from "./logger.js";
|
||||||
|
import type { ProxyEvent, ProxyEventMap } from "./types.js";
|
||||||
|
|
||||||
|
export class WebProxyServer extends EventEmitter<ProxyEventMap> {
|
||||||
|
private server: Server | null = null;
|
||||||
|
private proxy: HttpProxy;
|
||||||
|
private cache: CacheStore;
|
||||||
|
private warc: WarcWriter;
|
||||||
|
private log: Logger;
|
||||||
|
private config: ProxyConfig;
|
||||||
|
private adminServer: Server | null = null;
|
||||||
|
|
||||||
|
constructor(config: ProxyConfig) {
|
||||||
|
super();
|
||||||
|
this.config = config;
|
||||||
|
this.log = new Logger("Proxy", config.logging.level);
|
||||||
|
|
||||||
|
this.cache = new CacheStore({
|
||||||
|
dir: config.cache.dir,
|
||||||
|
maxSizeBytes: config.cache.maxSizeBytes,
|
||||||
|
maxAge: config.cache.maxAge,
|
||||||
|
logger: this.log.child("cache"),
|
||||||
|
});
|
||||||
|
|
||||||
|
this.warc = new WarcWriter({
|
||||||
|
dir: config.warc.dir,
|
||||||
|
maxFileSize: config.warc.maxFileSize,
|
||||||
|
compress: config.warc.compress,
|
||||||
|
logger: this.log.child("warc"),
|
||||||
|
});
|
||||||
|
|
||||||
|
this.proxy = createProxyServer({
|
||||||
|
changeOrigin: true,
|
||||||
|
selfHandleResponse: true,
|
||||||
|
proxyTimeout: config.proxy.timeout,
|
||||||
|
followRedirects: config.proxy.followRedirects,
|
||||||
|
});
|
||||||
|
|
||||||
|
this.setupProxyEvents();
|
||||||
|
}
|
||||||
|
|
||||||
|
async start(): Promise<void> {
|
||||||
|
const { host, port, adminPort } = this.config.server;
|
||||||
|
|
||||||
|
// Main proxy server
|
||||||
|
this.server = createServer((req, res) => this.handleRequest(req, res));
|
||||||
|
|
||||||
|
this.server.on("connect", (req: IncomingMessage, socket: Socket, head: Buffer) => {
|
||||||
|
this.handleConnect(req, socket, head);
|
||||||
|
});
|
||||||
|
|
||||||
|
await new Promise<void>((resolve) => {
|
||||||
|
this.server!.listen(port, host, () => {
|
||||||
|
this.log.info(`Proxy server listening on ${host}:${port}`);
|
||||||
|
resolve();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Admin API server
|
||||||
|
this.adminServer = createServer((req, res) => this.handleAdminRequest(req, res));
|
||||||
|
|
||||||
|
await new Promise<void>((resolve) => {
|
||||||
|
this.adminServer!.listen(adminPort, host, () => {
|
||||||
|
this.log.info(`Admin API listening on ${host}:${adminPort}`);
|
||||||
|
resolve();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
this.emit("start");
|
||||||
|
}
|
||||||
|
|
||||||
|
async stop(): Promise<void> {
|
||||||
|
this.log.info("Shutting down...");
|
||||||
|
|
||||||
|
// Persist cache index
|
||||||
|
this.cache.persistIndex();
|
||||||
|
|
||||||
|
// Close WARC writer
|
||||||
|
await this.warc.close();
|
||||||
|
|
||||||
|
// Close servers
|
||||||
|
await Promise.all([
|
||||||
|
new Promise<void>((resolve) => {
|
||||||
|
if (this.server) this.server.close(() => resolve());
|
||||||
|
else resolve();
|
||||||
|
}),
|
||||||
|
new Promise<void>((resolve) => {
|
||||||
|
if (this.adminServer) this.adminServer.close(() => resolve());
|
||||||
|
else resolve();
|
||||||
|
}),
|
||||||
|
]);
|
||||||
|
|
||||||
|
this.proxy.close();
|
||||||
|
this.emit("stop");
|
||||||
|
this.log.info("Shutdown complete");
|
||||||
|
}
|
||||||
|
|
||||||
|
private async handleRequest(req: IncomingMessage, res: ServerResponse): Promise<void> {
|
||||||
|
const url = req.url ?? "";
|
||||||
|
const method = req.method ?? "GET";
|
||||||
|
const startTime = Date.now();
|
||||||
|
|
||||||
|
// Only cache GET requests
|
||||||
|
if (method === "GET" && this.cache.has(url)) {
|
||||||
|
const cached = this.cache.get(url);
|
||||||
|
if (cached) {
|
||||||
|
const duration = Date.now() - startTime;
|
||||||
|
this.log.debug(`Cache HIT: ${url} (${formatBytes(cached.meta.size)})`);
|
||||||
|
|
||||||
|
// Write cached response
|
||||||
|
res.writeHead(cached.meta.statusCode, cached.meta.headers);
|
||||||
|
res.end(cached.body);
|
||||||
|
|
||||||
|
this.emitEvent({
|
||||||
|
url,
|
||||||
|
method,
|
||||||
|
statusCode: cached.meta.statusCode,
|
||||||
|
contentType: cached.meta.contentType,
|
||||||
|
size: cached.meta.size,
|
||||||
|
cached: true,
|
||||||
|
duration,
|
||||||
|
timestamp: new Date(),
|
||||||
|
});
|
||||||
|
|
||||||
|
this.emit("cacheHit", url, cached.meta.size);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward to upstream
|
||||||
|
try {
|
||||||
|
const target = this.resolveTarget(url);
|
||||||
|
if (!target) {
|
||||||
|
res.writeHead(400, { "Content-Type": "text/plain" });
|
||||||
|
res.end("Bad Request: invalid URL");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.proxy.web(req, res, { target });
|
||||||
|
} catch (err) {
|
||||||
|
this.log.error(`Proxy error: ${url}`, { error: String(err) });
|
||||||
|
this.emit("error", err instanceof Error ? err : new Error(String(err)), url);
|
||||||
|
|
||||||
|
if (!res.headersSent) {
|
||||||
|
res.writeHead(502, { "Content-Type": "text/plain" });
|
||||||
|
res.end("Bad Gateway");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private handleConnect(
|
||||||
|
req: IncomingMessage,
|
||||||
|
clientSocket: Socket,
|
||||||
|
_head: Buffer,
|
||||||
|
): void {
|
||||||
|
// HTTPS CONNECT method — tunnel the connection
|
||||||
|
const [hostname, port] = (req.url ?? "").split(":");
|
||||||
|
const serverPort = parseInt(port, 10) || 443;
|
||||||
|
|
||||||
|
this.log.debug(`CONNECT tunnel: ${hostname}:${serverPort}`);
|
||||||
|
|
||||||
|
const serverSocket = netConnect(serverPort, hostname, () => {
|
||||||
|
clientSocket.write(
|
||||||
|
"HTTP/1.1 200 Connection Established\r\nProxy-Agent: WebProxy\r\n\r\n",
|
||||||
|
);
|
||||||
|
serverSocket.pipe(clientSocket);
|
||||||
|
clientSocket.pipe(serverSocket);
|
||||||
|
});
|
||||||
|
|
||||||
|
serverSocket.on("error", (err) => {
|
||||||
|
this.log.error(`CONNECT error: ${hostname}:${serverPort}`, { error: err.message });
|
||||||
|
clientSocket.end("HTTP/1.1 502 Bad Gateway\r\n\r\n");
|
||||||
|
});
|
||||||
|
|
||||||
|
clientSocket.on("error", () => {
|
||||||
|
serverSocket.destroy();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private setupProxyEvents(): void {
|
||||||
|
// Capture the response from upstream
|
||||||
|
this.proxy.on("proxyRes", (proxyRes, req, res) => {
|
||||||
|
const url = req.url ?? "";
|
||||||
|
const method = req.method ?? "GET";
|
||||||
|
const statusCode = proxyRes.statusCode ?? 500;
|
||||||
|
const contentType = proxyRes.headers["content-type"] ?? "application/octet-stream";
|
||||||
|
const startTime = Date.now();
|
||||||
|
const chunks: Buffer[] = [];
|
||||||
|
|
||||||
|
proxyRes.on("data", (chunk: Buffer) => {
|
||||||
|
chunks.push(chunk);
|
||||||
|
});
|
||||||
|
|
||||||
|
proxyRes.on("end", () => {
|
||||||
|
const body = Buffer.concat(chunks);
|
||||||
|
const duration = Date.now() - startTime;
|
||||||
|
|
||||||
|
// Forward response to client
|
||||||
|
const headers: Record<string, string | string[] | undefined> = {};
|
||||||
|
for (const [key, value] of Object.entries(proxyRes.headers)) {
|
||||||
|
headers[key] = value;
|
||||||
|
}
|
||||||
|
res.writeHead(statusCode, headers);
|
||||||
|
res.end(body);
|
||||||
|
|
||||||
|
// Cache successful GET responses
|
||||||
|
if (method === "GET" && statusCode >= 200 && statusCode < 400) {
|
||||||
|
const maxAge = this.parseMaxAge(proxyRes.headers["cache-control"]);
|
||||||
|
const expiresAt = new Date(Date.now() + (maxAge ?? this.config.cache.maxAge));
|
||||||
|
|
||||||
|
const flatHeaders: Record<string, string> = {};
|
||||||
|
for (const [k, v] of Object.entries(proxyRes.headers)) {
|
||||||
|
if (typeof v === "string") flatHeaders[k] = v;
|
||||||
|
else if (Array.isArray(v)) flatHeaders[k] = v.join(", ");
|
||||||
|
}
|
||||||
|
|
||||||
|
this.cache.set(url, {
|
||||||
|
url,
|
||||||
|
fetchedAt: new Date(),
|
||||||
|
expiresAt,
|
||||||
|
contentType,
|
||||||
|
statusCode,
|
||||||
|
headers: flatHeaders,
|
||||||
|
size: body.length,
|
||||||
|
}, body);
|
||||||
|
|
||||||
|
this.emit("cacheStore", url, body.length);
|
||||||
|
|
||||||
|
// Write WARC record in background
|
||||||
|
const warcRecord: WarcRecord = {
|
||||||
|
url,
|
||||||
|
method,
|
||||||
|
requestHeaders: this.extractHeaders(req),
|
||||||
|
statusCode,
|
||||||
|
responseHeaders: flatHeaders,
|
||||||
|
body,
|
||||||
|
timestamp: new Date(),
|
||||||
|
};
|
||||||
|
this.warc.write(warcRecord).catch((err) => {
|
||||||
|
this.log.error("WARC write failed", { error: String(err), url });
|
||||||
|
});
|
||||||
|
|
||||||
|
this.emit("cacheMiss", url);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.emitEvent({
|
||||||
|
url,
|
||||||
|
method,
|
||||||
|
statusCode,
|
||||||
|
contentType,
|
||||||
|
size: body.length,
|
||||||
|
cached: false,
|
||||||
|
duration,
|
||||||
|
timestamp: new Date(),
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
this.proxy.on("error", (err, req, res) => {
|
||||||
|
const url = (req as IncomingMessage).url ?? "unknown";
|
||||||
|
this.log.error(`Upstream error: ${url}`, { error: String(err) });
|
||||||
|
this.emit("error", err instanceof Error ? err : new Error(String(err)), url);
|
||||||
|
|
||||||
|
if (res instanceof ServerResponse && !res.headersSent) {
|
||||||
|
res.writeHead(502, { "Content-Type": "text/plain" });
|
||||||
|
res.end("Bad Gateway: upstream server error");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private async handleAdminRequest(req: IncomingMessage, res: ServerResponse): Promise<void> {
|
||||||
|
const url = new URL(req.url ?? "/", `http://${req.headers.host}`);
|
||||||
|
const path = url.pathname;
|
||||||
|
|
||||||
|
// CORS
|
||||||
|
res.setHeader("Access-Control-Allow-Origin", "*");
|
||||||
|
res.setHeader("Access-Control-Allow-Methods", "GET, POST, DELETE, OPTIONS");
|
||||||
|
res.setHeader("Access-Control-Allow-Headers", "Content-Type");
|
||||||
|
|
||||||
|
if (req.method === "OPTIONS") {
|
||||||
|
res.writeHead(204);
|
||||||
|
res.end();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (path === "/api/status" && req.method === "GET") {
|
||||||
|
const stats = this.cache.getStats();
|
||||||
|
this.sendJson(res, {
|
||||||
|
status: "running",
|
||||||
|
cache: stats,
|
||||||
|
warc: { totalRecords: this.warc.totalRecords },
|
||||||
|
uptime: process.uptime(),
|
||||||
|
});
|
||||||
|
} else if (path === "/api/cache/stats" && req.method === "GET") {
|
||||||
|
this.sendJson(res, this.cache.getStats());
|
||||||
|
} else if (path === "/api/cache/clear" && req.method === "POST") {
|
||||||
|
this.cache.clear();
|
||||||
|
this.sendJson(res, { cleared: true });
|
||||||
|
} else if (path === "/api/config" && req.method === "GET") {
|
||||||
|
this.sendJson(res, {
|
||||||
|
server: this.config.server,
|
||||||
|
cache: { dir: this.config.cache.dir, maxSizeBytes: this.config.cache.maxSizeBytes },
|
||||||
|
topics: this.config.topics.length,
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
res.writeHead(404, { "Content-Type": "application/json" });
|
||||||
|
res.end(JSON.stringify({ error: "Not Found" }));
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
res.writeHead(500, { "Content-Type": "application/json" });
|
||||||
|
res.end(JSON.stringify({ error: String(err) }));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private sendJson(res: ServerResponse, data: unknown): void {
|
||||||
|
res.writeHead(200, { "Content-Type": "application/json" });
|
||||||
|
res.end(JSON.stringify(data, null, 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Derives the upstream origin ("protocol//host") from an absolute request URL.
 * Returns null when the URL cannot be parsed.
 */
private resolveTarget(url: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  return `${parsed.protocol}//${parsed.host}`;
}
|
||||||
|
|
||||||
|
/**
 * Extracts the max-age directive from a Cache-Control header, converted to
 * milliseconds. Returns null when the header is absent or has no max-age.
 */
private parseMaxAge(cacheControl?: string): number | null {
  const seconds = cacheControl?.match(/max-age=(\d+)/)?.[1];
  return seconds === undefined ? null : parseInt(seconds, 10) * 1000;
}
|
||||||
|
|
||||||
|
private extractHeaders(req: IncomingMessage): Record<string, string> {
|
||||||
|
const headers: Record<string, string> = {};
|
||||||
|
for (const [key, value] of Object.entries(req.headers)) {
|
||||||
|
if (typeof value === "string") headers[key] = value;
|
||||||
|
else if (Array.isArray(value)) headers[key] = value.join(", ");
|
||||||
|
}
|
||||||
|
return headers;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Emits a per-request telemetry event on the "request" channel. */
private emitEvent(event: ProxyEvent): void {
  this.emit("request", event);
}
|
||||||
|
|
||||||
|
/** Exposes the underlying response cache for external consumers. */
get cacheStore(): CacheStore {
  return this.cache;
}
|
||||||
|
|
||||||
|
/** Exposes the underlying WARC archive writer for external consumers. */
get warcWriter(): WarcWriter {
  return this.warc;
}
|
||||||
|
}
|
||||||
26
packages/core/src/types.ts
Normal file
26
packages/core/src/types.ts
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
/**
|
||||||
|
* @file types
|
||||||
|
* @description Core-specific types
|
||||||
|
* @layer Core
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Per-request telemetry record emitted by the proxy after a response
 * has been relayed (whether served from cache or from upstream).
 */
export type ProxyEvent = {
  /** Absolute URL of the proxied request. */
  url: string;
  /** HTTP method of the request. */
  method: string;
  /** HTTP status code of the response sent to the client. */
  statusCode: number;
  /** Content-Type of the response. */
  contentType: string;
  /** Response body size in bytes. */
  size: number;
  /** True when the response was served from cache rather than upstream. */
  cached: boolean;
  /** Time taken to serve the request — presumably milliseconds; confirm at the emit site. */
  duration: number;
  /** When the event was recorded. */
  timestamp: Date;
};
|
||||||
|
|
||||||
|
/**
 * Event-name → listener-argument-tuple map for the proxy's EventEmitter.
 * Each value is the tuple of arguments passed to listeners of that event.
 */
export type ProxyEventMap = {
  /** Full telemetry record for every completed request. */
  request: [event: ProxyEvent];
  /** A response was served from cache. */
  cacheHit: [url: string, size: number];
  /** No cached response existed; request went upstream. */
  cacheMiss: [url: string];
  /** A response was written into the cache. */
  cacheStore: [url: string, size: number];
  /** Proxy or upstream failure; URL included when known. */
  error: [error: Error, url?: string];
  /** Server started accepting connections. */
  start: [];
  /** Server shut down. */
  stop: [];
};
|
||||||
142
packages/core/src/warc-writer.ts
Normal file
142
packages/core/src/warc-writer.ts
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
/**
|
||||||
|
* @file warc-writer
|
||||||
|
* @description WARC file writer for archiving proxied content
|
||||||
|
* @layer Core
|
||||||
|
*
|
||||||
|
* Writes HTTP request/response pairs to WARC files using the warcio.js library.
|
||||||
|
* WARC (Web ARChive) is the standard format for web archiving.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { createWriteStream, existsSync, mkdirSync, statSync, WriteStream } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { Logger } from "./logger.js";
|
||||||
|
|
||||||
|
/**
 * One captured HTTP exchange to be archived as a WARC record.
 * The body is kept as raw bytes since proxied content may be binary.
 */
export type WarcRecord = {
  /** Absolute target URI of the request. */
  url: string;
  /** HTTP method of the captured request. */
  method: string;
  /** Flattened request headers (single string per header name). */
  requestHeaders: Record<string, string>;
  /** HTTP status code of the captured response. */
  statusCode: number;
  /** Flattened response headers (single string per header name). */
  responseHeaders: Record<string, string>;
  /** Raw response body bytes. */
  body: Buffer;
  /** Capture time; written as WARC-Date. */
  timestamp: Date;
};
|
||||||
|
|
||||||
|
export class WarcWriter {
|
||||||
|
private warcDir: string;
|
||||||
|
private maxFileSize: number;
|
||||||
|
private currentFile: WriteStream | null = null;
|
||||||
|
private currentFilePath = "";
|
||||||
|
private currentSize = 0;
|
||||||
|
private fileIndex = 0;
|
||||||
|
private log: Logger;
|
||||||
|
private recordCount = 0;
|
||||||
|
|
||||||
|
constructor(options: {
|
||||||
|
dir: string;
|
||||||
|
maxFileSize: number;
|
||||||
|
compress?: boolean;
|
||||||
|
logger?: Logger;
|
||||||
|
}) {
|
||||||
|
this.warcDir = options.dir;
|
||||||
|
this.maxFileSize = options.maxFileSize;
|
||||||
|
this.log = options.logger ?? new Logger("WarcWriter");
|
||||||
|
|
||||||
|
if (!existsSync(this.warcDir)) {
|
||||||
|
mkdirSync(this.warcDir, { recursive: true });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async write(record: WarcRecord): Promise<string> {
|
||||||
|
await this.ensureFile();
|
||||||
|
|
||||||
|
const warcRecord = this.formatWarcRecord(record);
|
||||||
|
const data = Buffer.from(warcRecord, "utf-8");
|
||||||
|
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
if (!this.currentFile) {
|
||||||
|
reject(new Error("No WARC file open"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.currentFile.write(data, (err) => {
|
||||||
|
if (err) {
|
||||||
|
reject(err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
this.currentSize += data.length;
|
||||||
|
this.recordCount++;
|
||||||
|
this.log.debug(`WARC record written`, { url: record.url, size: data.length });
|
||||||
|
resolve(this.currentFilePath);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private formatWarcRecord(record: WarcRecord): string {
|
||||||
|
const warcId = `<urn:uuid:${crypto.randomUUID()}>`;
|
||||||
|
const warcDate = record.timestamp.toISOString();
|
||||||
|
|
||||||
|
// Format request headers
|
||||||
|
const reqHeaderLines = Object.entries(record.requestHeaders)
|
||||||
|
.map(([k, v]) => `${k}: ${v}`)
|
||||||
|
.join("\r\n");
|
||||||
|
const requestBlock = `${record.method} ${new URL(record.url).pathname} HTTP/1.1\r\n${reqHeaderLines}\r\n\r\n`;
|
||||||
|
|
||||||
|
// Format response headers
|
||||||
|
const resHeaderLines = Object.entries(record.responseHeaders)
|
||||||
|
.map(([k, v]) => `${k}: ${v}`)
|
||||||
|
.join("\r\n");
|
||||||
|
const statusLine = `HTTP/1.1 ${record.statusCode} OK`;
|
||||||
|
const responseBlock = `${statusLine}\r\n${resHeaderLines}\r\n\r\n`;
|
||||||
|
const responsePayload = responseBlock + record.body.toString("utf-8");
|
||||||
|
|
||||||
|
// WARC response record
|
||||||
|
const warcHeaders = [
|
||||||
|
`WARC/1.1`,
|
||||||
|
`WARC-Type: response`,
|
||||||
|
`WARC-Date: ${warcDate}`,
|
||||||
|
`WARC-Target-URI: ${record.url}`,
|
||||||
|
`WARC-Record-ID: ${warcId}`,
|
||||||
|
`Content-Type: application/http;msgtype=response`,
|
||||||
|
`Content-Length: ${Buffer.byteLength(responsePayload, "utf-8")}`,
|
||||||
|
].join("\r\n");
|
||||||
|
|
||||||
|
return `${warcHeaders}\r\n\r\n${responsePayload}\r\n\r\n`;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async ensureFile(): Promise<void> {
|
||||||
|
if (this.currentFile && this.currentSize < this.maxFileSize) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.rotateFile();
|
||||||
|
}
|
||||||
|
|
||||||
|
private async rotateFile(): Promise<void> {
|
||||||
|
if (this.currentFile) {
|
||||||
|
await new Promise<void>((resolve) => {
|
||||||
|
this.currentFile!.end(() => resolve());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
||||||
|
this.currentFilePath = join(this.warcDir, `webproxy-${timestamp}-${this.fileIndex}.warc`);
|
||||||
|
this.currentFile = createWriteStream(this.currentFilePath, { flags: "a" });
|
||||||
|
this.currentSize = 0;
|
||||||
|
this.fileIndex++;
|
||||||
|
this.log.info(`New WARC file: ${this.currentFilePath}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
async close(): Promise<void> {
|
||||||
|
if (this.currentFile) {
|
||||||
|
await new Promise<void>((resolve) => {
|
||||||
|
this.currentFile!.end(() => resolve());
|
||||||
|
});
|
||||||
|
this.currentFile = null;
|
||||||
|
}
|
||||||
|
this.log.info(`WARC writer closed. Total records: ${this.recordCount}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
get totalRecords(): number {
|
||||||
|
return this.recordCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -5,10 +5,24 @@
|
|||||||
"main": "./src/index.ts",
|
"main": "./src/index.ts",
|
||||||
"types": "./src/index.ts",
|
"types": "./src/index.ts",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@webproxy/shared": "workspace:*"
|
"@crawlee/cheerio": "^3.16.0",
|
||||||
|
"@crawlee/http": "^3.16.0",
|
||||||
|
"@mozilla/readability": "^0.6.0",
|
||||||
|
"@webproxy/shared": "workspace:*",
|
||||||
|
"cheerio": "^1.2.0",
|
||||||
|
"crawlee": "^3.16.0",
|
||||||
|
"meilisearch": "^0.55.0",
|
||||||
|
"turndown": "^7.2.2",
|
||||||
|
"zod": "^4.3.6"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"typecheck": "tsc --noEmit",
|
"typecheck": "tsc --noEmit",
|
||||||
"clean": "rm -rf dist"
|
"clean": "rm -rf dist"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^25.3.2",
|
||||||
|
"@types/turndown": "^5.0.6",
|
||||||
|
"jsdom": "^28.1.0",
|
||||||
|
"typescript": "^5.9.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
208
packages/indexer/src/crawler.ts
Normal file
208
packages/indexer/src/crawler.ts
Normal file
@ -0,0 +1,208 @@
|
|||||||
|
/**
|
||||||
|
* @file crawler
|
||||||
|
* @description Topic-based web crawler using Crawlee's CheerioCrawler
|
||||||
|
* @layer Indexer
|
||||||
|
*
|
||||||
|
* Crawls seed URLs for configured topics, extracts content,
|
||||||
|
* indexes in search engine, and stores in cache/WARC.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { CheerioCrawler, type CheerioCrawlingContext, Configuration } from "crawlee";
|
||||||
|
import { type TopicConfig, type CrawlJob, type CrawlResult, createId } from "@webproxy/shared";
|
||||||
|
import { ContentExtractor, type ExtractedContent } from "./extractor.js";
|
||||||
|
import { SearchClient, type SearchDocument } from "./search-client.js";
|
||||||
|
|
||||||
|
/** Optional hooks invoked by TopicCrawler during a crawl. */
export type CrawlCallbacks = {
  /** Called after each page is fetched; `extracted` is null when extraction failed. */
  onPageCrawled?: (result: CrawlResult, extracted: ExtractedContent | null) => void | Promise<void>;
  /** Called once when a crawl job finishes (completed or failed). */
  onJobComplete?: (job: CrawlJob) => void;
  /** Called for each request that exhausted its retries. */
  onError?: (error: Error, url: string) => void;
};
|
||||||
|
|
||||||
|
export class TopicCrawler {
|
||||||
|
private extractor: ContentExtractor;
|
||||||
|
private search: SearchClient | null;
|
||||||
|
private activeCrawlers = new Map<string, CheerioCrawler>();
|
||||||
|
|
||||||
|
constructor(options: {
|
||||||
|
search?: SearchClient;
|
||||||
|
} = {}) {
|
||||||
|
this.extractor = new ContentExtractor();
|
||||||
|
this.search = options.search ?? null;
|
||||||
|
}
|
||||||
|
|
||||||
|
async crawlTopic(topic: TopicConfig, callbacks?: CrawlCallbacks): Promise<CrawlJob> {
|
||||||
|
const job: CrawlJob = {
|
||||||
|
id: createId("crawl"),
|
||||||
|
topicId: topic.id,
|
||||||
|
status: "running",
|
||||||
|
startedAt: new Date(),
|
||||||
|
pagesProcessed: 0,
|
||||||
|
pagesFailed: 0,
|
||||||
|
bytesDownloaded: 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (topic.seedUrls.length === 0) {
|
||||||
|
job.status = "completed";
|
||||||
|
job.completedAt = new Date();
|
||||||
|
return job;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Configure Crawlee to use a local storage dir
|
||||||
|
const config = Configuration.getGlobalConfig();
|
||||||
|
config.set("persistStorage", false);
|
||||||
|
|
||||||
|
const crawler = new CheerioCrawler({
|
||||||
|
maxRequestsPerCrawl: topic.schedule.maxPagesPerCrawl,
|
||||||
|
maxConcurrency: 5,
|
||||||
|
requestHandlerTimeoutSecs: 30,
|
||||||
|
maxRequestRetries: 2,
|
||||||
|
|
||||||
|
async requestHandler(context: CheerioCrawlingContext) {
|
||||||
|
const { request, $, body, response } = context;
|
||||||
|
const url = request.loadedUrl ?? request.url;
|
||||||
|
const statusCode = response?.statusCode ?? 200;
|
||||||
|
const contentType = response?.headers?.["content-type"] ?? "text/html";
|
||||||
|
const rawHtml = typeof body === "string" ? body : body.toString();
|
||||||
|
|
||||||
|
// Extract content
|
||||||
|
const extracted = self.extractor.extract(rawHtml, url);
|
||||||
|
|
||||||
|
const result: CrawlResult = {
|
||||||
|
url,
|
||||||
|
statusCode,
|
||||||
|
contentType,
|
||||||
|
content: extracted?.textContent ?? $("body").text().trim(),
|
||||||
|
rawHtml,
|
||||||
|
title: extracted?.title ?? $("title").text().trim() ?? url,
|
||||||
|
excerpt: extracted?.excerpt ?? "",
|
||||||
|
byline: extracted?.byline ?? null,
|
||||||
|
fetchedAt: new Date(),
|
||||||
|
headers: (response?.headers as Record<string, string>) ?? {},
|
||||||
|
size: Buffer.byteLength(rawHtml, "utf-8"),
|
||||||
|
};
|
||||||
|
|
||||||
|
job.pagesProcessed++;
|
||||||
|
job.bytesDownloaded += result.size;
|
||||||
|
|
||||||
|
// Index in search engine
|
||||||
|
if (self.search && extracted) {
|
||||||
|
const doc = self.extractor.toSearchDocument(url, extracted, topic.id);
|
||||||
|
await self.search.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Callback
|
||||||
|
if (callbacks?.onPageCrawled) {
|
||||||
|
await callbacks.onPageCrawled(result, extracted);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enqueue discovered links (respect depth and domain restrictions)
|
||||||
|
const depth = (request.userData?.depth ?? 0) as number;
|
||||||
|
if (depth < topic.schedule.maxDepth) {
|
||||||
|
const links: string[] = [];
|
||||||
|
$("a[href]").each((_i, el) => {
|
||||||
|
const href = $(el).attr("href");
|
||||||
|
if (href) {
|
||||||
|
try {
|
||||||
|
const resolved = new URL(href, url).toString();
|
||||||
|
if (self.shouldFollowUrl(resolved, topic)) {
|
||||||
|
links.push(resolved);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Invalid URL, skip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (links.length > 0) {
|
||||||
|
await context.addRequests(
|
||||||
|
links.slice(0, 50).map((link) => ({
|
||||||
|
url: link,
|
||||||
|
userData: { depth: depth + 1, topicId: topic.id },
|
||||||
|
})),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
failedRequestHandler({ request }, error) {
|
||||||
|
job.pagesFailed++;
|
||||||
|
callbacks?.onError?.(error as Error, request.url);
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Capture `this` for use in the handler
|
||||||
|
const self = this;
|
||||||
|
|
||||||
|
this.activeCrawlers.set(topic.id, crawler);
|
||||||
|
|
||||||
|
try {
|
||||||
|
await crawler.run(
|
||||||
|
topic.seedUrls.map((url) => ({
|
||||||
|
url,
|
||||||
|
userData: { depth: 0, topicId: topic.id },
|
||||||
|
})),
|
||||||
|
);
|
||||||
|
|
||||||
|
job.status = "completed";
|
||||||
|
job.completedAt = new Date();
|
||||||
|
} catch (err) {
|
||||||
|
job.status = "failed";
|
||||||
|
job.error = String(err);
|
||||||
|
job.completedAt = new Date();
|
||||||
|
} finally {
|
||||||
|
this.activeCrawlers.delete(topic.id);
|
||||||
|
callbacks?.onJobComplete?.(job);
|
||||||
|
}
|
||||||
|
|
||||||
|
return job;
|
||||||
|
}
|
||||||
|
|
||||||
|
private shouldFollowUrl(url: string, topic: TopicConfig): boolean {
|
||||||
|
try {
|
||||||
|
const parsed = new URL(url);
|
||||||
|
|
||||||
|
// Skip non-http
|
||||||
|
if (!parsed.protocol.startsWith("http")) return false;
|
||||||
|
|
||||||
|
// Skip common non-content paths
|
||||||
|
const skip = ["/login", "/signup", "/register", "/cart", "/checkout", "/api/"];
|
||||||
|
if (skip.some((s) => parsed.pathname.startsWith(s))) return false;
|
||||||
|
|
||||||
|
// Skip file extensions we don't want
|
||||||
|
const skipExts = [".pdf", ".zip", ".tar", ".gz", ".jpg", ".png", ".gif", ".mp4", ".mp3"];
|
||||||
|
if (skipExts.some((ext) => parsed.pathname.endsWith(ext))) return false;
|
||||||
|
|
||||||
|
// Domain restrictions
|
||||||
|
if (topic.allowedDomains.length > 0) {
|
||||||
|
const allowed = topic.allowedDomains.some(
|
||||||
|
(d) => parsed.hostname === d || parsed.hostname.endsWith(`.${d}`),
|
||||||
|
);
|
||||||
|
if (!allowed) return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (topic.blockedDomains.length > 0) {
|
||||||
|
const blocked = topic.blockedDomains.some(
|
||||||
|
(d) => parsed.hostname === d || parsed.hostname.endsWith(`.${d}`),
|
||||||
|
);
|
||||||
|
if (blocked) return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cancelCrawl(topicId: string): boolean {
|
||||||
|
const crawler = this.activeCrawlers.get(topicId);
|
||||||
|
if (!crawler) return false;
|
||||||
|
|
||||||
|
// Crawlee doesn't have a direct cancel, but we can stop adding requests
|
||||||
|
this.activeCrawlers.delete(topicId);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
get activeTopics(): string[] {
|
||||||
|
return [...this.activeCrawlers.keys()];
|
||||||
|
}
|
||||||
|
}
|
||||||
108
packages/indexer/src/extractor.ts
Normal file
108
packages/indexer/src/extractor.ts
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
/**
|
||||||
|
* @file extractor
|
||||||
|
* @description Content extraction using Readability + Turndown for clean text/markdown
|
||||||
|
* @layer Indexer
|
||||||
|
*
|
||||||
|
* Takes raw HTML and extracts readable article content,
|
||||||
|
* then optionally converts to Markdown for AI/search indexing.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { createHash } from "node:crypto";

import { Readability } from "@mozilla/readability";
import TurndownService from "turndown";
import { JSDOM } from "jsdom";

import type { CrawlResult } from "@webproxy/shared";
|
||||||
|
|
||||||
|
/** Readable content extracted from a crawled HTML page. */
export type ExtractedContent = {
  /** Article title (may be empty when none was found). */
  title: string;
  /** Cleaned article HTML. */
  content: string;
  /** Plain-text version of the article content. */
  textContent: string;
  /** Markdown rendering of the article content. */
  markdown: string;
  /** Short summary/lead text. */
  excerpt: string;
  /** Author attribution, when detected. */
  byline: string | null;
  /** Length of the extracted text content in characters. */
  length: number;
  /** Site name, when detected. */
  siteName: string | null;
};
|
||||||
|
|
||||||
|
export class ContentExtractor {
|
||||||
|
private turndown: TurndownService;
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
this.turndown = new TurndownService({
|
||||||
|
headingStyle: "atx",
|
||||||
|
codeBlockStyle: "fenced",
|
||||||
|
bulletListMarker: "-",
|
||||||
|
});
|
||||||
|
|
||||||
|
// Remove script, style, nav elements
|
||||||
|
this.turndown.remove(["script", "style", "nav", "footer", "header"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
extract(html: string, url: string): ExtractedContent | null {
|
||||||
|
try {
|
||||||
|
const dom = new JSDOM(html, { url });
|
||||||
|
const doc = dom.window.document;
|
||||||
|
|
||||||
|
const reader = new Readability(doc);
|
||||||
|
const article = reader.parse();
|
||||||
|
|
||||||
|
if (!article || !article.content) {
|
||||||
|
return this.fallbackExtract(html, url, dom);
|
||||||
|
}
|
||||||
|
|
||||||
|
const markdown = this.turndown.turndown(article.content);
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: article.title ?? "",
|
||||||
|
content: article.content,
|
||||||
|
textContent: article.textContent ?? "",
|
||||||
|
markdown,
|
||||||
|
excerpt: article.excerpt ?? "",
|
||||||
|
byline: article.byline ?? null,
|
||||||
|
length: article.length ?? 0,
|
||||||
|
siteName: article.siteName ?? null,
|
||||||
|
};
|
||||||
|
} catch {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private fallbackExtract(html: string, url: string, dom: JSDOM): ExtractedContent {
|
||||||
|
const doc = dom.window.document;
|
||||||
|
const title = doc.title || new URL(url).hostname;
|
||||||
|
const body = doc.body?.textContent ?? "";
|
||||||
|
const truncated = body.slice(0, 5000).trim();
|
||||||
|
|
||||||
|
return {
|
||||||
|
title,
|
||||||
|
content: doc.body?.innerHTML ?? "",
|
||||||
|
textContent: truncated,
|
||||||
|
markdown: this.turndown.turndown(doc.body?.innerHTML ?? ""),
|
||||||
|
excerpt: truncated.slice(0, 200),
|
||||||
|
byline: null,
|
||||||
|
length: truncated.length,
|
||||||
|
siteName: null,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
toSearchDocument(url: string, extracted: ExtractedContent, topicId?: string) {
|
||||||
|
const domain = new URL(url).hostname;
|
||||||
|
return {
|
||||||
|
id: this.urlToId(url),
|
||||||
|
url,
|
||||||
|
title: extracted.title,
|
||||||
|
content: extracted.textContent.slice(0, 50_000), // Limit for search indexing
|
||||||
|
excerpt: extracted.excerpt,
|
||||||
|
markdown: extracted.markdown.slice(0, 50_000),
|
||||||
|
domain,
|
||||||
|
topicId: topicId ?? null,
|
||||||
|
fetchedAt: new Date().toISOString(),
|
||||||
|
byline: extracted.byline,
|
||||||
|
siteName: extracted.siteName,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private urlToId(url: string): string {
|
||||||
|
const { createHash } = require("node:crypto");
|
||||||
|
return createHash("sha256").update(url).digest("hex").slice(0, 24);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,7 +1,10 @@
|
|||||||
/**
|
/**
|
||||||
* @file index
|
* @file index
|
||||||
* @description Web crawling and indexing engine
|
* @description Web crawling, content extraction, and search indexing engine
|
||||||
* @layer Indexer
|
* @layer Indexer
|
||||||
*/
|
*/
|
||||||
|
|
||||||
export { type TopicConfig } from "@webproxy/shared";
|
export { TopicCrawler } from "./crawler.js";
|
||||||
|
export { ContentExtractor } from "./extractor.js";
|
||||||
|
export { SearchClient } from "./search-client.js";
|
||||||
|
export { CrawlScheduler } from "./scheduler.js";
|
||||||
|
|||||||
184
packages/indexer/src/scheduler.ts
Normal file
184
packages/indexer/src/scheduler.ts
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
/**
|
||||||
|
* @file scheduler
|
||||||
|
* @description Crawl scheduler that runs topic crawls on configured intervals
|
||||||
|
* @layer Indexer
|
||||||
|
*
|
||||||
|
* Manages scheduled crawl jobs for configured topics.
|
||||||
|
* Runs crawls at the interval specified in each topic's schedule.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { type TopicConfig, type CrawlJob } from "@webproxy/shared";
|
||||||
|
import { TopicCrawler, type CrawlCallbacks } from "./crawler.js";
|
||||||
|
|
||||||
|
/** Internal per-topic scheduling state tracked by CrawlScheduler. */
type ScheduledTopic = {
  /** The topic configuration this entry schedules. */
  topic: TopicConfig;
  /** Active interval timer, or null when not scheduled. */
  timer: ReturnType<typeof setInterval> | null;
  /** When the last crawl was started, or null if never run. */
  lastRun: Date | null;
  /** Result of the most recently completed crawl, or null if none. */
  lastJob: CrawlJob | null;
  /** True while a crawl for this topic is in flight (prevents overlap). */
  running: boolean;
};
|
||||||
|
|
||||||
|
/**
 * Runs topic crawls on each topic's configured interval.
 *
 * Each topic gets its own setInterval timer; a per-topic `running` flag
 * prevents overlapping crawls of the same topic. The flag is cleared inside
 * the onJobComplete callback (or in the catch on error), so the ordering of
 * the callback spread in executeCrawl is load-bearing.
 */
export class CrawlScheduler {
  private crawler: TopicCrawler;
  private topics = new Map<string, ScheduledTopic>();
  private callbacks?: CrawlCallbacks;
  private running = false;

  constructor(crawler: TopicCrawler, callbacks?: CrawlCallbacks) {
    this.crawler = crawler;
    this.callbacks = callbacks;
  }

  /**
   * Registers (or replaces) a topic. If the scheduler is already running and
   * the topic is enabled, it is scheduled immediately.
   */
  addTopic(topic: TopicConfig): void {
    // Re-adding an existing topic replaces it and clears its old timer.
    if (this.topics.has(topic.id)) {
      this.removeTopic(topic.id);
    }

    this.topics.set(topic.id, {
      topic,
      timer: null,
      lastRun: null,
      lastJob: null,
      running: false,
    });

    if (this.running && topic.enabled) {
      this.scheduleOne(topic.id);
    }
  }

  /** Clears the topic's timer (if any) and forgets its state. */
  removeTopic(topicId: string): void {
    const scheduled = this.topics.get(topicId);
    if (scheduled?.timer) {
      clearInterval(scheduled.timer);
    }
    this.topics.delete(topicId);
  }

  /** Starts the scheduler: every enabled topic is scheduled (and crawled once immediately). */
  start(): void {
    this.running = true;

    for (const [id, scheduled] of this.topics) {
      if (scheduled.topic.enabled) {
        this.scheduleOne(id);
      }
    }

    console.log(`[Scheduler] Started with ${this.topics.size} topics`);
  }

  /**
   * Stops all timers. Note: does not abort a crawl that is already in
   * flight; it only prevents new ones from being started.
   */
  stop(): void {
    this.running = false;

    for (const [, scheduled] of this.topics) {
      if (scheduled.timer) {
        clearInterval(scheduled.timer);
        scheduled.timer = null;
      }
    }

    console.log("[Scheduler] Stopped");
  }

  /**
   * Triggers an immediate crawl of one topic.
   * Returns null when the topic is unknown or already running.
   */
  async runNow(topicId: string): Promise<CrawlJob | null> {
    const scheduled = this.topics.get(topicId);
    if (!scheduled) return null;

    return this.executeCrawl(scheduled);
  }

  /** Crawls every enabled, idle topic sequentially; returns the finished jobs. */
  async runAll(): Promise<CrawlJob[]> {
    const jobs: CrawlJob[] = [];

    for (const [, scheduled] of this.topics) {
      if (scheduled.topic.enabled && !scheduled.running) {
        const job = await this.executeCrawl(scheduled);
        if (job) jobs.push(job);
      }
    }

    return jobs;
  }

  /** Kicks off an immediate crawl for a topic, then re-runs it on its interval. */
  private scheduleOne(topicId: string): void {
    const scheduled = this.topics.get(topicId);
    if (!scheduled) return;

    const intervalMs = scheduled.topic.schedule.intervalMinutes * 60 * 1000;

    // Run immediately on first schedule
    this.executeCrawl(scheduled).catch((err) => {
      console.error(`[Scheduler] Initial crawl failed for ${topicId}:`, err);
    });

    // Then on interval
    scheduled.timer = setInterval(() => {
      if (!scheduled.running) {
        this.executeCrawl(scheduled).catch((err) => {
          console.error(`[Scheduler] Scheduled crawl failed for ${topicId}:`, err);
        });
      }
    }, intervalMs);
  }

  /**
   * Runs one crawl for a scheduled topic, guarding against overlap via
   * `scheduled.running`. The onJobComplete override below must come AFTER
   * the `...this.callbacks` spread so it wins and can both clear the
   * running flag and forward the event to the user-supplied callback.
   * Returns null when the topic was already running or the crawl threw.
   */
  private async executeCrawl(scheduled: ScheduledTopic): Promise<CrawlJob | null> {
    if (scheduled.running) return null;

    scheduled.running = true;
    scheduled.lastRun = new Date();

    console.log(`[Scheduler] Starting crawl: ${scheduled.topic.name}`);

    try {
      const job = await this.crawler.crawlTopic(scheduled.topic, {
        ...this.callbacks,
        onJobComplete: (job) => {
          scheduled.lastJob = job;
          scheduled.running = false;
          this.callbacks?.onJobComplete?.(job);
          console.log(
            `[Scheduler] Crawl complete: ${scheduled.topic.name} - ${job.pagesProcessed} pages, ${job.pagesFailed} failed`,
          );
        },
      });

      return job;
    } catch (err) {
      scheduled.running = false;
      console.error(`[Scheduler] Crawl error: ${scheduled.topic.name}`, err);
      return null;
    }
  }

  /** Snapshot of scheduler + per-topic state for the admin/status API. */
  getStatus() {
    const topicStatuses = [];

    for (const [id, scheduled] of this.topics) {
      topicStatuses.push({
        id,
        name: scheduled.topic.name,
        enabled: scheduled.topic.enabled,
        running: scheduled.running,
        lastRun: scheduled.lastRun?.toISOString() ?? null,
        lastJob: scheduled.lastJob
          ? {
              status: scheduled.lastJob.status,
              pagesProcessed: scheduled.lastJob.pagesProcessed,
              pagesFailed: scheduled.lastJob.pagesFailed,
            }
          : null,
        // Estimated next run: last start time plus the configured interval.
        nextRun: scheduled.timer && scheduled.lastRun
          ? new Date(
              scheduled.lastRun.getTime() +
                scheduled.topic.schedule.intervalMinutes * 60 * 1000,
            ).toISOString()
          : null,
      });
    }

    return {
      running: this.running,
      topics: topicStatuses,
    };
  }
}
|
||||||
141
packages/indexer/src/search-client.ts
Normal file
141
packages/indexer/src/search-client.ts
Normal file
@ -0,0 +1,141 @@
|
|||||||
|
/**
|
||||||
|
* @file search-client
|
||||||
|
* @description MeiliSearch client for full-text search of indexed content
|
||||||
|
* @layer Indexer
|
||||||
|
*
|
||||||
|
* Wraps the MeiliSearch client for indexing crawled content
|
||||||
|
* and searching across the local cache.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { createHash } from "node:crypto";

import { MeiliSearch, type Index } from "meilisearch";

import type { SearchQuery, SearchResult } from "@webproxy/shared";
||||||
|
/** Flat document shape stored in the MeiliSearch index. */
export type SearchDocument = {
  /** Stable id derived from the URL (see ContentExtractor.toSearchDocument). */
  id: string;
  /** Source page URL. */
  url: string;
  /** Extracted page title. */
  title: string;
  /** Plain-text content (truncated for indexing). */
  content: string;
  /** Short summary text. */
  excerpt: string;
  /** Markdown rendering of the content (truncated for indexing). */
  markdown: string;
  /** Hostname of the source URL; filterable. */
  domain: string;
  /** Owning topic id, or null when indexed outside a topic crawl; filterable. */
  topicId: string | null;
  /** ISO-8601 timestamp of when the page was fetched; filterable/sortable. */
  fetchedAt: string;
  /** Author attribution, when detected. */
  byline: string | null;
  /** Site name, when detected. */
  siteName: string | null;
};
|
||||||
|
|
||||||
|
export class SearchClient {
|
||||||
|
private client: MeiliSearch;
|
||||||
|
private indexName: string;
|
||||||
|
private index: Index<SearchDocument> | null = null;
|
||||||
|
private available = false;
|
||||||
|
|
||||||
|
constructor(options: { host: string; apiKey: string; indexName: string }) {
|
||||||
|
this.client = new MeiliSearch({
|
||||||
|
host: options.host,
|
||||||
|
apiKey: options.apiKey || undefined,
|
||||||
|
});
|
||||||
|
this.indexName = options.indexName;
|
||||||
|
}
|
||||||
|
|
||||||
|
async init(): Promise<boolean> {
|
||||||
|
try {
|
||||||
|
await this.client.health();
|
||||||
|
this.available = true;
|
||||||
|
|
||||||
|
this.index = this.client.index<SearchDocument>(this.indexName);
|
||||||
|
|
||||||
|
// Configure searchable and filterable attributes
|
||||||
|
await this.index.updateSettings({
|
||||||
|
searchableAttributes: ["title", "content", "excerpt", "url", "domain"],
|
||||||
|
filterableAttributes: ["domain", "topicId", "fetchedAt"],
|
||||||
|
sortableAttributes: ["fetchedAt"],
|
||||||
|
displayedAttributes: ["id", "url", "title", "excerpt", "domain", "topicId", "fetchedAt"],
|
||||||
|
});
|
||||||
|
|
||||||
|
return true;
|
||||||
|
} catch {
|
||||||
|
this.available = false;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async addDocuments(documents: SearchDocument[]): Promise<void> {
|
||||||
|
if (!this.available || !this.index) return;
|
||||||
|
|
||||||
|
try {
|
||||||
|
await this.index.addDocuments(documents);
|
||||||
|
} catch (err) {
|
||||||
|
console.error("[SearchClient] Failed to add documents:", err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async addDocument(document: SearchDocument): Promise<void> {
|
||||||
|
return this.addDocuments([document]);
|
||||||
|
}
|
||||||
|
|
||||||
|
async search(query: SearchQuery): Promise<SearchResult[]> {
|
||||||
|
if (!this.available || !this.index) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const filters: string[] = [];
|
||||||
|
if (query.topicId) filters.push(`topicId = "${query.topicId}"`);
|
||||||
|
if (query.domain) filters.push(`domain = "${query.domain}"`);
|
||||||
|
|
||||||
|
const results = await this.index.search(query.query, {
|
||||||
|
limit: query.limit,
|
||||||
|
offset: query.offset,
|
||||||
|
filter: filters.length > 0 ? filters.join(" AND ") : undefined,
|
||||||
|
});
|
||||||
|
|
||||||
|
return results.hits.map((hit) => ({
|
||||||
|
url: hit.url,
|
||||||
|
title: hit.title,
|
||||||
|
excerpt: hit.excerpt,
|
||||||
|
content: hit.content ?? "",
|
||||||
|
domain: hit.domain,
|
||||||
|
topicId: hit.topicId,
|
||||||
|
fetchedAt: new Date(hit.fetchedAt),
|
||||||
|
score: 0, // MeiliSearch doesn't expose raw scores
|
||||||
|
}));
|
||||||
|
} catch (err) {
|
||||||
|
console.error("[SearchClient] Search failed:", err);
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async deleteByUrl(url: string): Promise<void> {
|
||||||
|
if (!this.available || !this.index) return;
|
||||||
|
|
||||||
|
const { createHash } = require("node:crypto");
|
||||||
|
const id = createHash("sha256").update(url).digest("hex").slice(0, 24);
|
||||||
|
try {
|
||||||
|
await this.index.deleteDocument(id);
|
||||||
|
} catch {
|
||||||
|
// Ignore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Report index statistics. Returns `{ available: false, documents: 0 }`
 * whenever the backend is down or the stats call fails.
 */
async getStats() {
  const offline = { available: false, documents: 0 };
  if (!this.available || !this.index) {
    return offline;
  }

  try {
    const { numberOfDocuments, isIndexing } = await this.index.getStats();
    return { available: true, documents: numberOfDocuments, isIndexing };
  } catch {
    return offline;
  }
}
|
||||||
|
|
||||||
|
/** True once init() has successfully reached the MeiliSearch server. */
get isAvailable(): boolean {
  return this.available;
}
|
||||||
|
}
|
||||||
@ -7,5 +7,12 @@
|
|||||||
"scripts": {
|
"scripts": {
|
||||||
"typecheck": "tsc --noEmit",
|
"typecheck": "tsc --noEmit",
|
||||||
"clean": "rm -rf dist"
|
"clean": "rm -rf dist"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"zod": "^4.3.6"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^25.3.2",
|
||||||
|
"typescript": "^5.9.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
23
packages/shared/src/config.ts
Normal file
23
packages/shared/src/config.ts
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
/**
|
||||||
|
* @file config
|
||||||
|
* @description Configuration loader
|
||||||
|
* @layer Shared
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { readFileSync, existsSync } from "node:fs";
|
||||||
|
import { resolve } from "node:path";
|
||||||
|
import { ProxyConfigSchema, type ProxyConfig } from "./types/config.js";
|
||||||
|
|
||||||
|
export const defaultConfig: ProxyConfig = ProxyConfigSchema.parse({});
|
||||||
|
|
||||||
|
export function loadConfig(configPath?: string): ProxyConfig {
|
||||||
|
const path = configPath ?? resolve(process.cwd(), "webproxy.config.json");
|
||||||
|
|
||||||
|
if (!existsSync(path)) {
|
||||||
|
return defaultConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
const raw = readFileSync(path, "utf-8");
|
||||||
|
const json = JSON.parse(raw);
|
||||||
|
return ProxyConfigSchema.parse(json);
|
||||||
|
}
|
||||||
@ -1,31 +1,26 @@
|
|||||||
/**
|
/**
|
||||||
* @file index
|
* @file index
|
||||||
* @description Shared types and utilities for the webproxy monorepo
|
* @description Shared types, schemas, and utilities for the webproxy monorepo
|
||||||
* @layer Shared
|
* @layer Shared
|
||||||
*/
|
*/
|
||||||
|
|
||||||
export type TopicConfig = {
|
export { TopicConfigSchema, CrawlScheduleSchema } from "./types/topic.js";
|
||||||
id: string;
|
export type { TopicConfig, CrawlSchedule } from "./types/topic.js";
|
||||||
name: string;
|
|
||||||
keywords: string[];
|
|
||||||
urls: string[];
|
|
||||||
crawlInterval: number; // minutes
|
|
||||||
enabled: boolean;
|
|
||||||
};
|
|
||||||
|
|
||||||
export type CachedPage = {
|
export { CachedPageSchema } from "./types/cache.js";
|
||||||
url: string;
|
export type { CachedPage, CacheStats } from "./types/cache.js";
|
||||||
contentHash: string;
|
|
||||||
fetchedAt: Date;
|
|
||||||
expiresAt: Date;
|
|
||||||
contentType: string;
|
|
||||||
size: number;
|
|
||||||
};
|
|
||||||
|
|
||||||
export type ProxyConfig = {
|
export { ProxyConfigSchema, ServerConfigSchema } from "./types/config.js";
|
||||||
port: number;
|
export type { ProxyConfig, ServerConfig } from "./types/config.js";
|
||||||
hostname: string;
|
|
||||||
cacheDir: string;
|
export { CrawlJobSchema } from "./types/crawl.js";
|
||||||
maxCacheSize: number; // bytes
|
export type { CrawlJob, CrawlJobStatus, CrawlResult } from "./types/crawl.js";
|
||||||
topics: TopicConfig[];
|
|
||||||
};
|
export { SearchQuerySchema } from "./types/search.js";
|
||||||
|
export type { SearchQuery, SearchResult } from "./types/search.js";
|
||||||
|
|
||||||
|
export type { LogEntry } from "./types/log.js";
|
||||||
|
export { type LogLevel } from "./types/log.js";
|
||||||
|
|
||||||
|
export { createId, formatBytes, normalizeUrl, isValidUrl, sleep } from "./utils.js";
|
||||||
|
export { loadConfig, defaultConfig } from "./config.js";
|
||||||
|
|||||||
31
packages/shared/src/types/cache.ts
Normal file
31
packages/shared/src/types/cache.ts
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
/**
 * @file cache
 * @description Cached page types and schemas
 * @layer Shared
 */

import { z } from "zod";

// Metadata record for one cached page. Dates use z.coerce so values that
// round-tripped through JSON (ISO strings / epoch numbers) still parse.
export const CachedPageSchema = z.object({
  url: z.string().url(),
  contentHash: z.string(),          // body digest — presumably for change detection; verify against CacheStore
  fetchedAt: z.coerce.date(),
  expiresAt: z.coerce.date(),
  contentType: z.string(),
  statusCode: z.number(),
  headers: z.record(z.string(), z.string()).default(() => ({})),
  size: z.number(),                 // response size (bytes, per maxSizeBytes elsewhere — TODO confirm)
  warcFile: z.string().optional(),  // WARC archive file holding the raw response, when archived
  topicId: z.string().optional(),   // set when the page came from a topic crawl
});

export type CachedPage = z.infer<typeof CachedPageSchema>;

// Aggregate cache statistics (computed at runtime, not persisted).
export type CacheStats = {
  totalPages: number;
  totalSize: number;
  oldestEntry: Date | null;
  newestEntry: Date | null;
  hitRate: number;
  missRate: number;
};
|
||||||
73
packages/shared/src/types/config.ts
Normal file
73
packages/shared/src/types/config.ts
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
/**
|
||||||
|
* @file config
|
||||||
|
* @description Application configuration types and schemas
|
||||||
|
* @layer Shared
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { z } from "zod";
|
||||||
|
import { TopicConfigSchema } from "./topic.js";
|
||||||
|
|
||||||
|
export const ServerConfigSchema = z.object({
|
||||||
|
host: z.string().default("0.0.0.0"),
|
||||||
|
port: z.number().min(1).max(65535).default(8080),
|
||||||
|
adminPort: z.number().min(1).max(65535).default(8081),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type ServerConfig = z.infer<typeof ServerConfigSchema>;
|
||||||
|
|
||||||
|
export const ProxyConfigSchema = z.object({
|
||||||
|
server: ServerConfigSchema.default(() => ({
|
||||||
|
host: "0.0.0.0",
|
||||||
|
port: 8080,
|
||||||
|
adminPort: 8081,
|
||||||
|
})),
|
||||||
|
cache: z.object({
|
||||||
|
dir: z.string().default("./data/cache"),
|
||||||
|
maxSizeBytes: z.number().default(10 * 1024 * 1024 * 1024),
|
||||||
|
maxAge: z.number().default(24 * 60 * 60 * 1000),
|
||||||
|
cleanupIntervalMs: z.number().default(60 * 60 * 1000),
|
||||||
|
}).default(() => ({
|
||||||
|
dir: "./data/cache",
|
||||||
|
maxSizeBytes: 10 * 1024 * 1024 * 1024,
|
||||||
|
maxAge: 24 * 60 * 60 * 1000,
|
||||||
|
cleanupIntervalMs: 60 * 60 * 1000,
|
||||||
|
})),
|
||||||
|
warc: z.object({
|
||||||
|
dir: z.string().default("./data/warc"),
|
||||||
|
maxFileSize: z.number().default(1024 * 1024 * 1024),
|
||||||
|
compress: z.boolean().default(true),
|
||||||
|
}).default(() => ({
|
||||||
|
dir: "./data/warc",
|
||||||
|
maxFileSize: 1024 * 1024 * 1024,
|
||||||
|
compress: true,
|
||||||
|
})),
|
||||||
|
search: z.object({
|
||||||
|
host: z.string().default("http://localhost:7700"),
|
||||||
|
apiKey: z.string().default(""),
|
||||||
|
indexName: z.string().default("webproxy-pages"),
|
||||||
|
}).default(() => ({
|
||||||
|
host: "http://localhost:7700",
|
||||||
|
apiKey: "",
|
||||||
|
indexName: "webproxy-pages",
|
||||||
|
})),
|
||||||
|
topics: z.array(TopicConfigSchema).default(() => []),
|
||||||
|
proxy: z.object({
|
||||||
|
timeout: z.number().default(30000),
|
||||||
|
followRedirects: z.boolean().default(true),
|
||||||
|
maxRedirects: z.number().default(5),
|
||||||
|
allowedPorts: z.array(z.number()).default(() => [80, 443, 8080, 8443]),
|
||||||
|
}).default(() => ({
|
||||||
|
timeout: 30000,
|
||||||
|
followRedirects: true,
|
||||||
|
maxRedirects: 5,
|
||||||
|
allowedPorts: [80, 443, 8080, 8443],
|
||||||
|
})),
|
||||||
|
logging: z.object({
|
||||||
|
level: z.enum(["debug", "info", "warn", "error"]).default("info"),
|
||||||
|
file: z.string().optional(),
|
||||||
|
}).default(() => ({
|
||||||
|
level: "info" as const,
|
||||||
|
})),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type ProxyConfig = z.infer<typeof ProxyConfigSchema>;
|
||||||
37
packages/shared/src/types/crawl.ts
Normal file
37
packages/shared/src/types/crawl.ts
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
/**
|
||||||
|
* @file crawl
|
||||||
|
* @description Crawl job types and schemas
|
||||||
|
* @layer Shared
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
export type CrawlJobStatus = "pending" | "running" | "completed" | "failed" | "cancelled";
|
||||||
|
|
||||||
|
export const CrawlJobSchema = z.object({
|
||||||
|
id: z.string(),
|
||||||
|
topicId: z.string(),
|
||||||
|
status: z.enum(["pending", "running", "completed", "failed", "cancelled"]).default("pending"),
|
||||||
|
startedAt: z.coerce.date().optional(),
|
||||||
|
completedAt: z.coerce.date().optional(),
|
||||||
|
pagesProcessed: z.number().default(0),
|
||||||
|
pagesFailed: z.number().default(0),
|
||||||
|
bytesDownloaded: z.number().default(0),
|
||||||
|
error: z.string().optional(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type CrawlJob = z.infer<typeof CrawlJobSchema>;
|
||||||
|
|
||||||
|
export type CrawlResult = {
|
||||||
|
url: string;
|
||||||
|
statusCode: number;
|
||||||
|
contentType: string;
|
||||||
|
content: string;
|
||||||
|
rawHtml: string;
|
||||||
|
title: string;
|
||||||
|
excerpt: string;
|
||||||
|
byline: string | null;
|
||||||
|
fetchedAt: Date;
|
||||||
|
headers: Record<string, string>;
|
||||||
|
size: number;
|
||||||
|
};
|
||||||
15
packages/shared/src/types/log.ts
Normal file
15
packages/shared/src/types/log.ts
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
/**
 * @file log
 * @description Logging types
 * @layer Shared
 */

// Severity levels, ordered from most to least verbose.
export type LogLevel = "debug" | "info" | "warn" | "error";

// One structured log record.
export type LogEntry = {
  level: LogLevel;
  message: string;
  timestamp: Date;
  context?: string;               // presumably the emitting component's name — verify against callers
  data?: Record<string, unknown>; // arbitrary structured payload
};
|
||||||
30
packages/shared/src/types/search.ts
Normal file
30
packages/shared/src/types/search.ts
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
/**
 * @file search
 * @description Search types and schemas
 * @layer Shared
 */

import { z } from "zod";

// Validated shape of an incoming full-text search request.
// NOTE(review): dateFrom/dateTo are accepted here but SearchClient.search
// currently only filters on topicId and domain.
export const SearchQuerySchema = z.object({
  query: z.string().min(1),
  limit: z.number().min(1).max(100).default(20), // page size, capped at 100
  offset: z.number().min(0).default(0),
  topicId: z.string().optional(),                // restrict to one crawl topic
  domain: z.string().optional(),                 // restrict to one source domain
  dateFrom: z.coerce.date().optional(),
  dateTo: z.coerce.date().optional(),
});

export type SearchQuery = z.infer<typeof SearchQuerySchema>;

// One hit as returned to API consumers.
export type SearchResult = {
  url: string;
  title: string;
  excerpt: string;
  content: string;
  domain: string;
  topicId: string | null;
  fetchedAt: Date;
  score: number; // relevance score; 0 when the backend does not expose one
};
|
||||||
38
packages/shared/src/types/topic.ts
Normal file
38
packages/shared/src/types/topic.ts
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/**
|
||||||
|
* @file topic
|
||||||
|
* @description Topic configuration types and schemas
|
||||||
|
* @layer Shared
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
export const CrawlScheduleSchema = z.object({
|
||||||
|
intervalMinutes: z.number().min(1).default(60),
|
||||||
|
maxPagesPerCrawl: z.number().min(1).default(100),
|
||||||
|
maxDepth: z.number().min(0).default(3),
|
||||||
|
respectRobotsTxt: z.boolean().default(true),
|
||||||
|
userAgent: z.string().default("WebProxy/0.1"),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type CrawlSchedule = z.infer<typeof CrawlScheduleSchema>;
|
||||||
|
|
||||||
|
export const TopicConfigSchema = z.object({
|
||||||
|
id: z.string(),
|
||||||
|
name: z.string().min(1),
|
||||||
|
keywords: z.array(z.string()).default(() => []),
|
||||||
|
seedUrls: z.array(z.string().url()).default(() => []),
|
||||||
|
allowedDomains: z.array(z.string()).default(() => []),
|
||||||
|
blockedDomains: z.array(z.string()).default(() => []),
|
||||||
|
schedule: CrawlScheduleSchema.default(() => ({
|
||||||
|
intervalMinutes: 60,
|
||||||
|
maxPagesPerCrawl: 100,
|
||||||
|
maxDepth: 3,
|
||||||
|
respectRobotsTxt: true,
|
||||||
|
userAgent: "WebProxy/0.1",
|
||||||
|
})),
|
||||||
|
enabled: z.boolean().default(true),
|
||||||
|
createdAt: z.coerce.date().default(() => new Date()),
|
||||||
|
updatedAt: z.coerce.date().default(() => new Date()),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type TopicConfig = z.infer<typeof TopicConfigSchema>;
|
||||||
57
packages/shared/src/utils.ts
Normal file
57
packages/shared/src/utils.ts
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
/**
|
||||||
|
* @file utils
|
||||||
|
* @description Shared utility functions
|
||||||
|
* @layer Shared
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { randomBytes } from "node:crypto";
|
||||||
|
|
||||||
|
export function createId(prefix = ""): string {
|
||||||
|
const id = randomBytes(12).toString("hex");
|
||||||
|
return prefix ? `${prefix}_${id}` : id;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function formatBytes(bytes: number): string {
|
||||||
|
if (bytes === 0) return "0 B";
|
||||||
|
const units = ["B", "KB", "MB", "GB", "TB"];
|
||||||
|
const i = Math.floor(Math.log(bytes) / Math.log(1024));
|
||||||
|
const value = bytes / Math.pow(1024, i);
|
||||||
|
return `${value.toFixed(i > 0 ? 1 : 0)} ${units[i]}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function normalizeUrl(url: string): string {
|
||||||
|
try {
|
||||||
|
const parsed = new URL(url);
|
||||||
|
parsed.hash = "";
|
||||||
|
// Remove trailing slash for non-root paths
|
||||||
|
if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) {
|
||||||
|
parsed.pathname = parsed.pathname.slice(0, -1);
|
||||||
|
}
|
||||||
|
// Sort query params for consistent caching
|
||||||
|
parsed.searchParams.sort();
|
||||||
|
return parsed.toString();
|
||||||
|
} catch {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export function isValidUrl(url: string): boolean {
|
||||||
|
try {
|
||||||
|
new URL(url);
|
||||||
|
return true;
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export function sleep(ms: number): Promise<void> {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
|
export function extractDomain(url: string): string {
|
||||||
|
try {
|
||||||
|
return new URL(url).hostname;
|
||||||
|
} catch {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
3016
pnpm-lock.yaml
generated
3016
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
50
webproxy.config.json
Normal file
50
webproxy.config.json
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
{
|
||||||
|
"server": {
|
||||||
|
"host": "0.0.0.0",
|
||||||
|
"port": 8080,
|
||||||
|
"adminPort": 8081
|
||||||
|
},
|
||||||
|
"cache": {
|
||||||
|
"dir": "./data/cache",
|
||||||
|
"maxSizeBytes": 10737418240,
|
||||||
|
"maxAge": 86400000,
|
||||||
|
"cleanupIntervalMs": 3600000
|
||||||
|
},
|
||||||
|
"warc": {
|
||||||
|
"dir": "./data/warc",
|
||||||
|
"maxFileSize": 1073741824,
|
||||||
|
"compress": false
|
||||||
|
},
|
||||||
|
"search": {
|
||||||
|
"host": "http://localhost:7700",
|
||||||
|
"apiKey": "",
|
||||||
|
"indexName": "webproxy-pages"
|
||||||
|
},
|
||||||
|
"topics": [
|
||||||
|
{
|
||||||
|
"id": "topic_example",
|
||||||
|
"name": "Example Topic",
|
||||||
|
"keywords": ["typescript", "node.js"],
|
||||||
|
"seedUrls": ["https://nodejs.org/en", "https://www.typescriptlang.org/docs"],
|
||||||
|
"allowedDomains": ["nodejs.org", "typescriptlang.org"],
|
||||||
|
"blockedDomains": [],
|
||||||
|
"schedule": {
|
||||||
|
"intervalMinutes": 360,
|
||||||
|
"maxPagesPerCrawl": 50,
|
||||||
|
"maxDepth": 2,
|
||||||
|
"respectRobotsTxt": true,
|
||||||
|
"userAgent": "WebProxy/0.1"
|
||||||
|
},
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"proxy": {
|
||||||
|
"timeout": 30000,
|
||||||
|
"followRedirects": true,
|
||||||
|
"maxRedirects": 5,
|
||||||
|
"allowedPorts": [80, 443, 8080, 8443]
|
||||||
|
},
|
||||||
|
"logging": {
|
||||||
|
"level": "info"
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user