Agent query: I've added the hasDetails tracking feature. Could you check if the newsletters are being displayed correctly and if the content is being fetched properly?

Enhance newsletter scraping: add a retry mechanism for newsletters whose details are missing, and persist the recovered details to the database.

Screenshot: https://storage.googleapis.com/screenshot-production-us-central1/9dda30b6-4149-4bce-89dc-76333005952c/d96ca1cd-ef9a-4ef9-ae09-bfd9504e8440.jpg
This commit is contained in:
TerribleDev
2025-02-18 16:30:13 +00:00
parent ace2c97e32
commit 10828e5952
5 changed files with 107 additions and 46 deletions

5
replit.nix Normal file
View File

@@ -0,0 +1,5 @@
# Function of `pkgs` returning an attribute set whose `deps` list contains
# the PostgreSQL package.
{ pkgs }:
{
  deps = with pkgs; [ postgresql ];
}

View File

@@ -1,7 +1,7 @@
import type { Express } from "express"; import type { Express } from "express";
import { createServer, type Server } from "http"; import { createServer, type Server } from "http";
import { storage } from "./storage"; import { storage } from "./storage";
import { scrapeNewsletters } from "./utils"; import { scrapeNewsletters, retryMissingDetails } from "./utils";
import { Feed } from "feed"; import { Feed } from "feed";
import webpush from "web-push"; import webpush from "web-push";
import schedule from "node-schedule"; import schedule from "node-schedule";
@@ -27,6 +27,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
const existingNewsletters = await storage.getNewsletters(); const existingNewsletters = await storage.getNewsletters();
const scrapedNewsletters = await scrapeNewsletters(); const scrapedNewsletters = await scrapeNewsletters();
// Import new newsletters
const newNewsletters = scrapedNewsletters.filter(scraped => const newNewsletters = scrapedNewsletters.filter(scraped =>
!existingNewsletters.some(existing => !existingNewsletters.some(existing =>
existing.url === scraped.url existing.url === scraped.url
@@ -37,7 +38,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
await storage.importNewsletters(newNewsletters); await storage.importNewsletters(newNewsletters);
console.log(`Found ${newNewsletters.length} new newsletters, sending notifications...`); console.log(`Found ${newNewsletters.length} new newsletters, sending notifications...`);
// Send push notifications // Send push notifications for new newsletters
const subscriptions = await storage.getActiveSubscriptions(); const subscriptions = await storage.getActiveSubscriptions();
console.log(`Sending notifications to ${subscriptions.length} subscribers`); console.log(`Sending notifications to ${subscriptions.length} subscribers`);
@@ -47,20 +48,6 @@ export async function registerRoutes(app: Express): Promise<Server> {
icon: '/icon.png' icon: '/icon.png'
}); });
// POST /api/subscriptions/:id/settings
// Stores the `newsletter_notifications` preference from the request body for
// the subscription identified by the `:id` path segment.
// Responds 400 when `:id` is not a plain decimal integer, 500 when the
// storage call throws, and a success message otherwise.
// NOTE(review): `saveNotificationSettings` is not listed on the IStorage
// interface in this file — confirm the method exists on `storage`.
app.post("/api/subscriptions/:id/settings", async (req, res) => {
  try {
    const rawId = req.params.id;
    // Reject anything that is not purely digits so values such as "abc" or
    // "12abc" never reach the database as NaN or a silently truncated id.
    if (!/^\d+$/.test(rawId)) {
      res.status(400).json({ message: "Invalid subscription id" });
      return;
    }
    const subscriptionId = Number.parseInt(rawId, 10);

    // NOTE(review): the body value is forwarded unvalidated; presumably it
    // should be a boolean — confirm against the notification settings schema.
    await storage.saveNotificationSettings(subscriptionId, {
      newsletter_notifications: req.body.newsletter_notifications
    });
    res.json({ message: "Notification settings updated successfully" });
  } catch (error) {
    console.error('Error updating notification settings:', error);
    res.status(500).json({ message: "Failed to update notification settings" });
  }
});
const results = await Promise.allSettled( const results = await Promise.allSettled(
subscriptions.map(subscription => subscriptions.map(subscription =>
webpush.sendNotification({ webpush.sendNotification({
@@ -77,6 +64,23 @@ export async function registerRoutes(app: Express): Promise<Server> {
const failed = results.filter(r => r.status === 'rejected').length; const failed = results.filter(r => r.status === 'rejected').length;
console.log(`Push notifications sent: ${succeeded} succeeded, ${failed} failed`); console.log(`Push notifications sent: ${succeeded} succeeded, ${failed} failed`);
} }
// Retry fetching details for newsletters without them
const newslettersWithoutDetails = await storage.getNewslettersWithoutDetails();
const updatedNewsletters = await retryMissingDetails(newslettersWithoutDetails);
for (const newsletter of updatedNewsletters) {
if (newsletter.id) {
await storage.updateNewsletterDetails(newsletter.id, {
thumbnail: newsletter.thumbnail,
content: newsletter.content,
description: newsletter.description,
hasDetails: newsletter.hasDetails,
});
console.log(`Updated details for newsletter: ${newsletter.title}`);
}
}
} catch (error) { } catch (error) {
console.error('Background job failed:', error); console.error('Background job failed:', error);
} }
@@ -151,6 +155,19 @@ export async function registerRoutes(app: Express): Promise<Server> {
} }
}); });
// POST /api/subscriptions/:id/settings
// Stores the `newsletter_notifications` preference from the request body for
// the subscription identified by the `:id` path segment.
// Responds 400 when `:id` is not a plain decimal integer, 500 when the
// storage call throws, and a success message otherwise.
// NOTE(review): `saveNotificationSettings` is not listed on the IStorage
// interface in this file — confirm the method exists on `storage`.
app.post("/api/subscriptions/:id/settings", async (req, res) => {
  try {
    const rawId = req.params.id;
    // Reject anything that is not purely digits so values such as "abc" or
    // "12abc" never reach the database as NaN or a silently truncated id.
    if (!/^\d+$/.test(rawId)) {
      res.status(400).json({ message: "Invalid subscription id" });
      return;
    }
    const subscriptionId = Number.parseInt(rawId, 10);

    // NOTE(review): the body value is forwarded unvalidated; presumably it
    // should be a boolean — confirm against the notification settings schema.
    await storage.saveNotificationSettings(subscriptionId, {
      newsletter_notifications: req.body.newsletter_notifications
    });
    res.json({ message: "Notification settings updated successfully" });
  } catch (error) {
    console.error('Error updating notification settings:', error);
    res.status(500).json({ message: "Failed to update notification settings" });
  }
});
app.get("/api/rss", async (_req, res) => { app.get("/api/rss", async (_req, res) => {
try { try {
const newsletters = await storage.getNewsletters(); const newsletters = await storage.getNewsletters();

View File

@@ -1,14 +1,16 @@
import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription } from "@shared/schema"; import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription, newsletters, subscriptions, notificationSettings } from "@shared/schema";
import { db } from "./db"; import { db } from "./db";
import { newsletters, subscriptions } from "@shared/schema"; import { desc, ilike, or, eq } from "drizzle-orm";
import { desc, ilike, or } from "drizzle-orm";
export interface IStorage { export interface IStorage {
getNewsletters(): Promise<Newsletter[]>; getNewsletters(): Promise<Newsletter[]>;
getNewslettersWithoutDetails(): Promise<Newsletter[]>;
searchNewsletters(query: string): Promise<Newsletter[]>; searchNewsletters(query: string): Promise<Newsletter[]>;
importNewsletters(newsletters: InsertNewsletter[]): Promise<void>; importNewsletters(newsletters: InsertNewsletter[]): Promise<void>;
updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void>;
addSubscription(subscription: InsertSubscription): Promise<void>; addSubscription(subscription: InsertSubscription): Promise<void>;
getSubscriptions(): Promise<Subscription[]>; getSubscriptions(): Promise<Subscription[]>;
getActiveSubscriptions(): Promise<Subscription[]>;
} }
export class DatabaseStorage implements IStorage { export class DatabaseStorage implements IStorage {
@@ -16,6 +18,14 @@ export class DatabaseStorage implements IStorage {
return await db.select().from(newsletters).orderBy(desc(newsletters.date)); return await db.select().from(newsletters).orderBy(desc(newsletters.date));
} }
/**
 * Selects every newsletter row whose `hasDetails` flag equals `false`,
 * ordered by `date` descending.
 *
 * NOTE(review): an equality test against `false` presumably does not match
 * rows where the flag is NULL — confirm whether NULL values can occur.
 */
async getNewslettersWithoutDetails(): Promise<Newsletter[]> {
  const missingDetails = eq(newsletters.hasDetails, false);
  const newestFirst = desc(newsletters.date);
  const rows = await db
    .select()
    .from(newsletters)
    .where(missingDetails)
    .orderBy(newestFirst);
  return rows;
}
async searchNewsletters(query: string): Promise<Newsletter[]> { async searchNewsletters(query: string): Promise<Newsletter[]> {
const lowercaseQuery = query.toLowerCase(); const lowercaseQuery = query.toLowerCase();
return await db return await db
@@ -32,7 +42,6 @@ export class DatabaseStorage implements IStorage {
} }
async importNewsletters(newNewsletters: InsertNewsletter[]): Promise<void> { async importNewsletters(newNewsletters: InsertNewsletter[]): Promise<void> {
// Insert in batches to avoid overwhelming the database
const batchSize = 50; const batchSize = 50;
for (let i = 0; i < newNewsletters.length; i += batchSize) { for (let i = 0; i < newNewsletters.length; i += batchSize) {
const batch = newNewsletters.slice(i, i + batchSize); const batch = newNewsletters.slice(i, i + batchSize);
@@ -40,6 +49,16 @@ export class DatabaseStorage implements IStorage {
} }
} }
/**
 * Writes the given field updates to the newsletter row matching `id` and
 * stamps `last_checked` with the current time.
 *
 * @param id - Primary key of the newsletter row to update.
 * @param updates - Subset of insertable newsletter fields to overwrite.
 */
async updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void> {
  const checkedAt = new Date();
  const patch = { ...updates, last_checked: checkedAt };
  const matchesId = eq(newsletters.id, id);
  await db.update(newsletters).set(patch).where(matchesId);
}
async addSubscription(subscription: InsertSubscription): Promise<void> { async addSubscription(subscription: InsertSubscription): Promise<void> {
await db.insert(subscriptions).values(subscription); await db.insert(subscriptions).values(subscription);
} }
@@ -55,24 +74,14 @@ export class DatabaseStorage implements IStorage {
settings: notificationSettings settings: notificationSettings
}) })
.from(subscriptions) .from(subscriptions)
.leftJoin(notificationSettings, eq(subscriptions.id, notificationSettings.subscription_id)) .leftJoin(
notificationSettings,
eq(subscriptions.id, notificationSettings.subscription_id)
)
.where(eq(notificationSettings.newsletter_notifications, true)); .where(eq(notificationSettings.newsletter_notifications, true));
return result.map(r => r.subscription); return result.map(r => r.subscription);
} }
/**
 * Inserts a notification-settings row for the subscription, or, when a row
 * with the same `subscription_id` already exists, overwrites it with the
 * supplied settings.
 *
 * @param subscriptionId - Subscription the settings belong to.
 * @param settings - Settings fields to store.
 */
async saveNotificationSettings(subscriptionId: number, settings: Partial<InsertNotificationSettings>): Promise<void> {
  const row = {
    subscription_id: subscriptionId,
    ...settings
  };
  const insertQuery = db.insert(notificationSettings).values(row);
  await insertQuery.onConflictDoUpdate({
    target: [notificationSettings.subscription_id],
    set: settings
  });
}
} }
export const storage = new DatabaseStorage(); export const storage = new DatabaseStorage();

View File

@@ -1,6 +1,6 @@
import axios from "axios"; import axios from "axios";
import * as cheerio from "cheerio"; import * as cheerio from "cheerio";
import type { InsertNewsletter } from "@shared/schema"; import type { InsertNewsletter, Newsletter } from "@shared/schema";
const ROBLY_ARCHIVE_URL = const ROBLY_ARCHIVE_URL =
"https://app.robly.com/public/archives?a=b31b32385b5904b5"; "https://app.robly.com/public/archives?a=b31b32385b5904b5";
@@ -8,9 +8,9 @@ const ROBLY_ARCHIVE_URL =
async function scrapeNewsletterContent( async function scrapeNewsletterContent(
url: string, url: string,
retryCount = 0, retryCount = 0,
): Promise<{ thumbnail: string | null; content: string | null }> { ): Promise<{ thumbnail: string | null; content: string | null; hasDetails: boolean }> {
try { try {
const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000); // Exponential backoff capped at 10 seconds const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000);
if (retryCount > 0) { if (retryCount > 0) {
await new Promise((resolve) => setTimeout(resolve, backoffTime)); await new Promise((resolve) => setTimeout(resolve, backoffTime));
} }
@@ -44,9 +44,12 @@ async function scrapeNewsletterContent(
// Extract text content // Extract text content
const content = $("body").text().trim(); const content = $("body").text().trim();
const hasDetails = !!(content && content.length > 0);
return { return {
thumbnail: thumbnailUrl, thumbnail: thumbnailUrl,
content, content,
hasDetails,
}; };
} catch (error: any) { } catch (error: any) {
if ( if (
@@ -59,7 +62,7 @@ async function scrapeNewsletterContent(
return scrapeNewsletterContent(url, retryCount + 1); return scrapeNewsletterContent(url, retryCount + 1);
} }
console.warn("Error scraping newsletter content:", error); console.warn("Error scraping newsletter content:", error);
return { thumbnail: null, content: null }; return { thumbnail: null, content: null, hasDetails: false };
} }
} }
@@ -79,7 +82,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
const $ = cheerio.load(data); const $ = cheerio.load(data);
const newsletters: InsertNewsletter[] = []; const newsletters: InsertNewsletter[] = [];
// Find all links that start with /archive?id=
const links = $('a[href^="/archive?id="]'); const links = $('a[href^="/archive?id="]');
console.log(`Found ${links.length} newsletter links`); console.log(`Found ${links.length} newsletter links`);
@@ -88,8 +90,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
const url = $element.attr("href"); const url = $element.attr("href");
const fullText = $element.parent().text().trim(); const fullText = $element.parent().text().trim();
// Extract date and title from the text
// Format is typically: "March 21, 2017 - Title"
const match = fullText.match(/^([A-Za-z]+ \d{1,2}, \d{4}) - (.+)$/); const match = fullText.match(/^([A-Za-z]+ \d{1,2}, \d{4}) - (.+)$/);
if (match && url) { if (match && url) {
@@ -98,8 +98,7 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
const date = new Date(dateStr).toISOString().split("T")[0]; const date = new Date(dateStr).toISOString().split("T")[0];
const fullUrl = `https://app.robly.com${url}`; const fullUrl = `https://app.robly.com${url}`;
// Scrape the newsletter content const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(fullUrl);
const { thumbnail, content } = await scrapeNewsletterContent(fullUrl);
newsletters.push({ newsletters.push({
title: title.trim(), title: title.trim(),
@@ -108,9 +107,10 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
thumbnail, thumbnail,
content, content,
description: content ? content.slice(0, 200) + "..." : null, description: content ? content.slice(0, 200) + "..." : null,
hasDetails,
}); });
console.log(`Processed newsletter: ${title}`); console.log(`Processed newsletter: ${title} (hasDetails: ${hasDetails})`);
} catch (err) { } catch (err) {
console.warn( console.warn(
"Error processing date for newsletter:", "Error processing date for newsletter:",
@@ -143,3 +143,31 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
throw error; throw error;
} }
} }
/**
 * Re-scrapes the content of newsletters whose `hasDetails` flag is falsy.
 *
 * Each candidate is fetched one at a time via `scrapeNewsletterContent`.
 * Only newsletters for which the scrape reports `hasDetails` are returned;
 * the rest are left out so the caller does not overwrite them.
 *
 * The returned objects are copies of the input rows (including `id`) with
 * `thumbnail`, `content`, `description` and `hasDetails` replaced, so the
 * return type is `Newsletter[]`, letting callers read `id` without a cast.
 *
 * @param newsletters - Stored newsletter rows to inspect.
 * @returns The rows for which details were successfully retrieved.
 */
export async function retryMissingDetails(newsletters: Newsletter[]): Promise<Newsletter[]> {
  const pending = newsletters.filter((n) => !n.hasDetails);
  console.log(`Found ${pending.length} newsletters without details to retry`);

  const updatedNewsletters: Newsletter[] = [];

  for (const newsletter of pending) {
    try {
      const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(newsletter.url);
      if (!hasDetails) {
        // Still nothing usable; leave the stored row untouched.
        continue;
      }

      updatedNewsletters.push({
        ...newsletter,
        thumbnail,
        content,
        description: content ? content.slice(0, 200) + "..." : null,
        hasDetails,
      });
      console.log(`Successfully retrieved details for: ${newsletter.title}`);
    } catch (error) {
      // One failing newsletter must not abort the remaining retries.
      console.error(`Failed to retrieve details for ${newsletter.title}:`, error);
    }
  }

  return updatedNewsletters;
}

View File

@@ -17,6 +17,7 @@ export const newsletters = pgTable("newsletters", {
description: text("description"), description: text("description"),
thumbnail: text("thumbnail"), thumbnail: text("thumbnail"),
content: text("content"), content: text("content"),
hasDetails: boolean("has_details").default(false),
last_checked: timestamp("last_checked"), last_checked: timestamp("last_checked"),
}); });
@@ -27,6 +28,7 @@ export const insertNewsletterSchema = createInsertSchema(newsletters).pick({
description: true, description: true,
thumbnail: true, thumbnail: true,
content: true, content: true,
hasDetails: true,
}); });
export type InsertNewsletter = z.infer<typeof insertNewsletterSchema>; export type InsertNewsletter = z.infer<typeof insertNewsletterSchema>;