User checkpoint: Add Redis to Replit and implement newsletter scraping functionality.

This commit is contained in:
TerribleDev
2025-02-15 18:55:18 +00:00
parent cda3af06df
commit a3d098b4f4
3 changed files with 248 additions and 2 deletions

View File

@@ -14,8 +14,11 @@ run = ["npm", "run", "start"]
localPort = 5000 localPort = 5000
externalPort = 80 externalPort = 80
[[ports]]
localPort = 6379
externalPort = 3000
[workflows] [workflows]
runButton = "Project"
[[workflows.workflow]] [[workflows.workflow]]
name = "Project" name = "Project"
@@ -40,3 +43,6 @@ task = "packager.installForAll"
task = "shell.exec" task = "shell.exec"
args = "npm run dev" args = "npm run dev"
waitForPort = 5000 waitForPort = 5000
[[processes]]
redis = { command = "redis-server", start_at = true }

240
dist2/queue.js Normal file
View File

@@ -0,0 +1,240 @@
var __defProp = Object.defineProperty;
// esbuild re-export shim: publish each entry of `all` on `target` as an
// enumerable getter, so the re-exported names behave like live bindings.
var __export = (target, all) => {
  for (var key in all) {
    __defProp(target, key, { get: all[key], enumerable: true });
  }
};
// server/queue.ts
import Queue from "bull";
// server/utils.ts
import axios from "axios";
import * as cheerio from "cheerio";
var ROBLY_ARCHIVE_URL = "https://app.robly.com/public/archives?a=b31b32385b5904b5";
/**
 * Fetches one newsletter page and extracts a thumbnail URL plus the page's
 * plain text. Best-effort by design: any failure is logged and
 * `{ thumbnail: null, content: null }` is returned so the caller's crawl
 * can continue.
 *
 * @param {string} url - Absolute URL of the newsletter page.
 * @returns {Promise<{thumbnail: string|null, content: string|null}>}
 */
async function scrapeNewsletterContent(url) {
  try {
    const { data } = await axios.get(url, {
      headers: {
        // Browser-like headers so the archive serves the normal HTML page.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5"
      },
      timeout: 15e3
    });
    const $ = cheerio.load(data);
    const images = $("img").toArray();
    // Second <img> is used as the thumbnail — presumably the first is a
    // logo/tracking pixel; TODO confirm against a real archive page.
    // Fix: attr() returns `undefined` when the attribute is missing; coalesce
    // to null so the documented string|null contract holds on every path.
    const thumbnailUrl = images.length > 1 ? ($(images[1]).attr("src") ?? null) : null;
    const content = $("body").text().trim();
    return {
      thumbnail: thumbnailUrl,
      content
    };
  } catch (error) {
    // Deliberate swallow: one bad page must not abort the whole batch.
    console.warn("Error scraping newsletter content:", error);
    return { thumbnail: null, content: null };
  }
}
// Month names as they appear in the archive listing; index = JS month number.
var ARCHIVE_MONTHS = [
  "january", "february", "march", "april", "may", "june",
  "july", "august", "september", "october", "november", "december"
];
/**
 * Converts an archive date such as "February 15, 2025" to "2025-02-15".
 *
 * Bug fix: the previous `new Date(dateStr).toISOString().split("T")[0]`
 * parsed the string at LOCAL midnight and then rendered it in UTC, which
 * shifts the result back one calendar day on any server running east of UTC.
 * Formatting the components directly involves no timezone round-trip.
 *
 * @param {string} dateStr - "MonthName D, YYYY".
 * @returns {string} ISO calendar date (YYYY-MM-DD).
 * @throws {Error} If the shape or month name is unrecognized (caller logs and skips).
 */
function formatArchiveDate(dateStr) {
  const parts = dateStr.match(/^([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})$/);
  if (!parts) throw new Error(`Unrecognized archive date: ${dateStr}`);
  const monthIndex = ARCHIVE_MONTHS.indexOf(parts[1].toLowerCase());
  if (monthIndex === -1) throw new Error(`Unrecognized month in date: ${dateStr}`);
  const month = String(monthIndex + 1).padStart(2, "0");
  const day = parts[2].padStart(2, "0");
  return `${parts[3]}-${month}-${day}`;
}
/**
 * Scrapes the Robly public archive index, then each linked newsletter page,
 * returning one record per newsletter.
 *
 * @returns {Promise<Array<{title: string, date: string, url: string,
 *   thumbnail: string|null, content: string|null, description: string|null}>>}
 * @throws {Error} If the archive yields no parseable newsletters, or on a
 *   network/HTTP failure (axios details are logged before rethrowing).
 */
async function scrapeNewsletters() {
  try {
    const { data } = await axios.get(ROBLY_ARCHIVE_URL, {
      headers: {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5"
      },
      timeout: 1e4
    });
    const $ = cheerio.load(data);
    const newsletters2 = [];
    const links = $('a[href^="/archive?id="]');
    console.log(`Found ${links.length} newsletter links`);
    // Intentionally sequential: one page fetch at a time keeps load on the
    // Robly servers low.
    for (const element of links.toArray()) {
      const $element = $(element);
      const url = $element.attr("href");
      // Listing rows look like "February 15, 2025 - Some Title".
      const fullText = $element.parent().text().trim();
      const match = fullText.match(/^([A-Za-z]+ \d{1,2}, \d{4}) - (.+)$/);
      if (match && url) {
        const [, dateStr, title] = match;
        try {
          const date2 = formatArchiveDate(dateStr);
          const fullUrl = `https://app.robly.com${url}`;
          const { thumbnail, content } = await scrapeNewsletterContent(fullUrl);
          newsletters2.push({
            title: title.trim(),
            date: date2,
            url: fullUrl,
            thumbnail,
            content,
            description: content ? content.slice(0, 200) + "..." : null
          });
          console.log(`Processed newsletter: ${title}`);
        } catch (err) {
          // A single malformed entry is skipped, not fatal.
          console.warn("Error processing date for newsletter:", { dateStr, title }, err);
        }
      }
    }
    if (newsletters2.length === 0) {
      console.error("No newsletters found in HTML. First 500 chars of response:", data.slice(0, 500));
      throw new Error("No newsletters found in the archive");
    }
    console.log(`Successfully scraped ${newsletters2.length} newsletters`);
    return newsletters2;
  } catch (error) {
    console.error("Error scraping newsletters:", error);
    if (axios.isAxiosError(error)) {
      console.error("Axios error details:", {
        status: error.response?.status,
        statusText: error.response?.statusText,
        data: error.response?.data
      });
    }
    throw error;
  }
}
// server/db.ts
import { Pool, neonConfig } from "@neondatabase/serverless";
import { drizzle } from "drizzle-orm/neon-serverless";
import ws from "ws";
// shared/schema.ts
// Bundler-generated namespace object for the shared/schema.ts module;
// getters keep each re-exported binding live rather than snapshotting it.
var schema_exports = {};
__export(schema_exports, {
  insertNewsletterSchema: () => insertNewsletterSchema,
  insertSubscriptionSchema: () => insertSubscriptionSchema,
  newsletters: () => newsletters,
  subscriptions: () => subscriptions
});
import { pgTable, text, serial, date, timestamp } from "drizzle-orm/pg-core";
import { createInsertSchema } from "drizzle-zod";
// Archived-newsletter rows populated by the scraper.
var newsletters = pgTable("newsletters", {
  id: serial("id").primaryKey(),
  title: text("title").notNull(),
  date: date("date").notNull(), // publication date from the archive listing
  url: text("url").notNull(), // canonical Robly URL; used to de-duplicate imports
  description: text("description"), // first ~200 chars of content, nullable
  thumbnail: text("thumbnail"), // scraped image URL, nullable
  content: text("content"), // full plain-text body, nullable
  last_checked: timestamp("last_checked") // NOTE(review): no writer visible in this file — confirm where it is set
});
// Insert-validation schema: only the scraper-supplied fields;
// id and last_checked are managed by the database / other code.
var insertNewsletterSchema = createInsertSchema(newsletters).pick({
  title: true,
  date: true,
  url: true,
  description: true,
  thumbnail: true,
  content: true
});
// Web-push subscriptions, one row per subscribed browser endpoint.
var subscriptions = pgTable("subscriptions", {
  id: serial("id").primaryKey(),
  endpoint: text("endpoint").notNull(), // push-service URL for this browser
  auth: text("auth").notNull(), // Web Push auth secret from the client
  p256dh: text("p256dh").notNull(), // client public key (P-256)
  created_at: timestamp("created_at").defaultNow()
});
// Insert-validation schema: the client supplies endpoint + keys;
// id and created_at are database-managed.
var insertSubscriptionSchema = createInsertSchema(subscriptions).pick({
  endpoint: true,
  auth: true,
  p256dh: true
});
// server/db.ts
// Neon's serverless driver needs an explicit WebSocket implementation when
// running under Node.
neonConfig.webSocketConstructor = ws;
const databaseUrl = process.env.DATABASE_URL;
if (!databaseUrl) {
  // Fail fast at module load — nothing below can work without a database.
  throw new Error(
    "DATABASE_URL must be set. Did you forget to provision a database?"
  );
}
var pool = new Pool({ connectionString: databaseUrl });
var db = drizzle({ client: pool, schema: schema_exports });
// server/storage.ts
import { desc, ilike, or } from "drizzle-orm";
// server/storage.ts — thin persistence layer over the drizzle `db` handle.
var DatabaseStorage = class {
  /** Returns all newsletters, newest publication date first. */
  async getNewsletters() {
    return await db.select().from(newsletters).orderBy(desc(newsletters.date));
  }
  /**
   * Case-insensitive substring search across title, content and description,
   * newest first.
   * @param {string} query - Raw user query, matched with SQL ILIKE.
   */
  async searchNewsletters(query) {
    // ILIKE is already case-insensitive; lowering the pattern is harmless
    // belt-and-braces.
    const lowercaseQuery = query.toLowerCase();
    // Fix: the old `newsletters.content || ""` fallback was dead code — a
    // drizzle column object is always truthy — and would have produced an
    // invalid ilike target had it ever fired. Pass the columns directly.
    return await db.select().from(newsletters).where(
      or(
        ilike(newsletters.title, `%${lowercaseQuery}%`),
        ilike(newsletters.content, `%${lowercaseQuery}%`),
        ilike(newsletters.description, `%${lowercaseQuery}%`)
      )
    ).orderBy(desc(newsletters.date));
  }
  /**
   * Bulk-inserts newsletters in batches of 50 to keep each INSERT's
   * parameter count bounded. No-op for an empty list.
   */
  async importNewsletters(newNewsletters) {
    const batchSize = 50;
    for (let i = 0; i < newNewsletters.length; i += batchSize) {
      const batch = newNewsletters.slice(i, i + batchSize);
      await db.insert(newsletters).values(batch);
    }
  }
  /** Stores one web-push subscription (endpoint + keys). */
  async addSubscription(subscription) {
    await db.insert(subscriptions).values(subscription);
  }
  /** Returns every stored web-push subscription. */
  async getSubscriptions() {
    return await db.select().from(subscriptions);
  }
};
var storage = new DatabaseStorage();
// server/queue.ts
import webpush from "web-push";
// Falls back to the local Redis started by the [[processes]] entry in .replit.
var REDIS_URL = process.env.REPLIT_REDIS_URL || "redis://localhost:6379";
// Bull queue driving the newsletter refresh + push-notification job.
var newsletterQueue = new Queue("newsletter-updates", REDIS_URL);
// Job handler: scrape the archive, persist any newsletters we have not seen
// before (keyed by URL), and push a notification to every subscriber.
// Errors are rethrown so Bull records the failure and can retry.
newsletterQueue.process(async (job) => {
  console.log("Processing newsletter update job...");
  try {
    const existingNewsletters = await storage.getNewsletters();
    const scrapedNewsletters = await scrapeNewsletters();
    // Perf fix: O(1) Set membership replaces the previous O(existing) .some()
    // scan executed once per scraped item.
    const knownUrls = new Set(existingNewsletters.map((existing) => existing.url));
    const newNewsletters = scrapedNewsletters.filter(
      (scraped) => !knownUrls.has(scraped.url)
    );
    if (newNewsletters.length > 0) {
      await storage.importNewsletters(newNewsletters);
      console.log(`Found ${newNewsletters.length} new newsletters, sending notifications...`);
      const subscriptions2 = await storage.getSubscriptions();
      console.log(`Sending notifications to ${subscriptions2.length} subscribers`);
      const notificationPayload = JSON.stringify({
        title: "New Newsletters Available",
        body: `${newNewsletters.length} new newsletter${newNewsletters.length > 1 ? "s" : ""} published!`,
        icon: "/icon.png"
      });
      // allSettled: one bad subscription must not fail the whole job.
      // NOTE(review): assumes VAPID details are configured on `webpush`
      // elsewhere — confirm. Rejected (expired/410) endpoints are only
      // counted, never pruned from storage; consider deleting them.
      const results = await Promise.allSettled(
        subscriptions2.map(
          (subscription) => webpush.sendNotification({
            endpoint: subscription.endpoint,
            keys: {
              auth: subscription.auth,
              p256dh: subscription.p256dh
            }
          }, notificationPayload)
        )
      );
      const succeeded = results.filter((r) => r.status === "fulfilled").length;
      const failed = results.filter((r) => r.status === "rejected").length;
      console.log(`Push notifications sent: ${succeeded} succeeded, ${failed} failed`);
    } else {
      console.log("No new newsletters found");
    }
  } catch (error) {
    // Rethrow so Bull marks the job failed and applies its retry policy.
    console.error("Queue job failed:", error);
    throw error;
  }
});
newsletterQueue.on("error", (error) => {
console.error("Queue error:", error);
});
newsletterQueue.on("completed", (job) => {
console.log(`Job ${job.id} completed successfully`);
});
newsletterQueue.on("failed", (job, error) => {
console.error(`Job ${job.id} failed:`, error);
});
export {
newsletterQueue
};

View File

@@ -5,7 +5,7 @@
"license": "MIT", "license": "MIT",
"scripts": { "scripts": {
"dev": "tsx server/index.ts", "dev": "tsx server/index.ts",
"build": "vite build && esbuild server/index.ts --platform=node --packages=external --bundle --format=esm --outdir=dist", "build": "vite build && esbuild server/index.ts --platform=node --packages=external --bundle --format=esm --outdir=dist2",
"start": "NODE_ENV=production node dist/index.js", "start": "NODE_ENV=production node dist/index.js",
"check": "tsc" "check": "tsc"
}, },