Agent query: I've added the hasDetails tracking feature. Could you check if the newsletters are being displayed correctly and if the content is being fetched properly?

Enhance newsletter scraping: Add retry mechanism for missing details and update database accordingly.

Screenshot: https://storage.googleapis.com/screenshot-production-us-central1/9dda30b6-4149-4bce-89dc-76333005952c/d96ca1cd-ef9a-4ef9-ae09-bfd9504e8440.jpg
This commit is contained in:
TerribleDev
2025-02-18 16:30:13 +00:00
parent ace2c97e32
commit 10828e5952
5 changed files with 107 additions and 46 deletions

5
replit.nix Normal file
View File

@@ -0,0 +1,5 @@
# Nix expression for replit.nix: given the package set, lists the packages
# to make available under `deps`.
{ pkgs }:
{
  deps = with pkgs; [
    postgresql
  ];
}

View File

@@ -1,7 +1,7 @@
import type { Express } from "express";
import { createServer, type Server } from "http";
import { storage } from "./storage";
import { scrapeNewsletters } from "./utils";
import { scrapeNewsletters, retryMissingDetails } from "./utils";
import { Feed } from "feed";
import webpush from "web-push";
import schedule from "node-schedule";
@@ -27,6 +27,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
const existingNewsletters = await storage.getNewsletters();
const scrapedNewsletters = await scrapeNewsletters();
// Import new newsletters
const newNewsletters = scrapedNewsletters.filter(scraped =>
!existingNewsletters.some(existing =>
existing.url === scraped.url
@@ -37,7 +38,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
await storage.importNewsletters(newNewsletters);
console.log(`Found ${newNewsletters.length} new newsletters, sending notifications...`);
// Send push notifications
// Send push notifications for new newsletters
const subscriptions = await storage.getActiveSubscriptions();
console.log(`Sending notifications to ${subscriptions.length} subscribers`);
@@ -47,20 +48,6 @@ export async function registerRoutes(app: Express): Promise<Server> {
icon: '/icon.png'
});
// Updates the newsletter notification preference for a single subscription.
// The :id path parameter must be a positive integer; anything else is rejected
// with 400 so that NaN never reaches the storage layer and surfaces as a 500.
app.post("/api/subscriptions/:id/settings", async (req, res) => {
  const subscriptionId = Number.parseInt(req.params.id, 10);
  if (!Number.isSafeInteger(subscriptionId) || subscriptionId <= 0) {
    return res.status(400).json({ message: "Invalid subscription id" });
  }

  try {
    await storage.saveNotificationSettings(subscriptionId, {
      newsletter_notifications: req.body.newsletter_notifications
    });
    res.json({ message: "Notification settings updated successfully" });
  } catch (error) {
    console.error('Error updating notification settings:', error);
    res.status(500).json({ message: "Failed to update notification settings" });
  }
});
const results = await Promise.allSettled(
subscriptions.map(subscription =>
webpush.sendNotification({
@@ -77,6 +64,23 @@ export async function registerRoutes(app: Express): Promise<Server> {
const failed = results.filter(r => r.status === 'rejected').length;
console.log(`Push notifications sent: ${succeeded} succeeded, ${failed} failed`);
}
// Retry fetching details for newsletters without them
const newslettersWithoutDetails = await storage.getNewslettersWithoutDetails();
const updatedNewsletters = await retryMissingDetails(newslettersWithoutDetails);
for (const newsletter of updatedNewsletters) {
if (newsletter.id) {
await storage.updateNewsletterDetails(newsletter.id, {
thumbnail: newsletter.thumbnail,
content: newsletter.content,
description: newsletter.description,
hasDetails: newsletter.hasDetails,
});
console.log(`Updated details for newsletter: ${newsletter.title}`);
}
}
} catch (error) {
console.error('Background job failed:', error);
}
@@ -151,6 +155,19 @@ export async function registerRoutes(app: Express): Promise<Server> {
}
});
// Updates the newsletter notification preference for a single subscription.
// The :id path parameter must be a positive integer; anything else is rejected
// with 400 so that NaN never reaches the storage layer and surfaces as a 500.
app.post("/api/subscriptions/:id/settings", async (req, res) => {
  const subscriptionId = Number.parseInt(req.params.id, 10);
  if (!Number.isSafeInteger(subscriptionId) || subscriptionId <= 0) {
    return res.status(400).json({ message: "Invalid subscription id" });
  }

  try {
    await storage.saveNotificationSettings(subscriptionId, {
      newsletter_notifications: req.body.newsletter_notifications
    });
    res.json({ message: "Notification settings updated successfully" });
  } catch (error) {
    console.error('Error updating notification settings:', error);
    res.status(500).json({ message: "Failed to update notification settings" });
  }
});
app.get("/api/rss", async (_req, res) => {
try {
const newsletters = await storage.getNewsletters();

View File

@@ -1,14 +1,16 @@
import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription } from "@shared/schema";
import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription, newsletters, subscriptions, notificationSettings } from "@shared/schema";
import { db } from "./db";
import { newsletters, subscriptions } from "@shared/schema";
import { desc, ilike, or } from "drizzle-orm";
import { desc, ilike, or, eq } from "drizzle-orm";
/**
 * Persistence contract for newsletters and subscriptions.
 *
 * NOTE(review): the routes call `storage.saveNotificationSettings(...)`, which is
 * not declared here — confirm whether it should be part of this contract.
 */
export interface IStorage {
/** Resolves with the stored newsletters. */
getNewsletters(): Promise<Newsletter[]>;
/** Resolves with the newsletters whose details are still missing. */
getNewslettersWithoutDetails(): Promise<Newsletter[]>;
/** Resolves with the newsletters matching the given search query. */
searchNewsletters(query: string): Promise<Newsletter[]>;
/** Stores the given newsletters. */
importNewsletters(newsletters: InsertNewsletter[]): Promise<void>;
/** Applies a partial set of field updates to the newsletter with the given id. */
updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void>;
/** Stores the given subscription. */
addSubscription(subscription: InsertSubscription): Promise<void>;
/** Resolves with the stored subscriptions. */
getSubscriptions(): Promise<Subscription[]>;
/** Resolves with the subscriptions considered active — presumably those with newsletter notifications enabled; verify against the implementation. */
getActiveSubscriptions(): Promise<Subscription[]>;
}
export class DatabaseStorage implements IStorage {
@@ -16,6 +18,14 @@ export class DatabaseStorage implements IStorage {
return await db.select().from(newsletters).orderBy(desc(newsletters.date));
}
/**
 * Fetches every newsletter row whose hasDetails flag is false,
 * ordered by date descending.
 */
async getNewslettersWithoutDetails(): Promise<Newsletter[]> {
  const missingDetails = eq(newsletters.hasDetails, false);
  const rows = await db
    .select()
    .from(newsletters)
    .where(missingDetails)
    .orderBy(desc(newsletters.date));
  return rows;
}
async searchNewsletters(query: string): Promise<Newsletter[]> {
const lowercaseQuery = query.toLowerCase();
return await db
@@ -32,7 +42,6 @@ export class DatabaseStorage implements IStorage {
}
async importNewsletters(newNewsletters: InsertNewsletter[]): Promise<void> {
// Insert in batches to avoid overwhelming the database
const batchSize = 50;
for (let i = 0; i < newNewsletters.length; i += batchSize) {
const batch = newNewsletters.slice(i, i + batchSize);
@@ -40,6 +49,16 @@ export class DatabaseStorage implements IStorage {
}
}
/**
 * Writes the supplied field updates to the newsletter row with the given id
 * and stamps last_checked with the current time.
 */
async updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void> {
  const changes = { ...updates, last_checked: new Date() };
  const matchesId = eq(newsletters.id, id);
  await db.update(newsletters).set(changes).where(matchesId);
}
/** Inserts the given subscription into the subscriptions table. */
async addSubscription(subscription: InsertSubscription): Promise<void> {
  const insertQuery = db.insert(subscriptions).values(subscription);
  await insertQuery;
}
@@ -55,24 +74,14 @@ export class DatabaseStorage implements IStorage {
settings: notificationSettings
})
.from(subscriptions)
.leftJoin(notificationSettings, eq(subscriptions.id, notificationSettings.subscription_id))
.leftJoin(
notificationSettings,
eq(subscriptions.id, notificationSettings.subscription_id)
)
.where(eq(notificationSettings.newsletter_notifications, true));
return result.map(r => r.subscription);
}
/**
 * Upserts the notification settings for a subscription: inserts a row keyed by
 * subscription_id, or applies `settings` to the existing row when one conflicts.
 */
async saveNotificationSettings(subscriptionId: number, settings: Partial<InsertNotificationSettings>): Promise<void> {
  const row = { subscription_id: subscriptionId, ...settings };
  const onConflict = {
    target: [notificationSettings.subscription_id],
    set: settings
  };
  await db.insert(notificationSettings).values(row).onConflictDoUpdate(onConflict);
}
}
export const storage = new DatabaseStorage();

View File

@@ -1,6 +1,6 @@
import axios from "axios";
import * as cheerio from "cheerio";
import type { InsertNewsletter } from "@shared/schema";
import type { InsertNewsletter, Newsletter } from "@shared/schema";
const ROBLY_ARCHIVE_URL =
"https://app.robly.com/public/archives?a=b31b32385b5904b5";
@@ -8,9 +8,9 @@ const ROBLY_ARCHIVE_URL =
async function scrapeNewsletterContent(
url: string,
retryCount = 0,
): Promise<{ thumbnail: string | null; content: string | null }> {
): Promise<{ thumbnail: string | null; content: string | null; hasDetails: boolean }> {
try {
const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000); // Exponential backoff capped at 10 seconds
const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000);
if (retryCount > 0) {
await new Promise((resolve) => setTimeout(resolve, backoffTime));
}
@@ -44,9 +44,12 @@ async function scrapeNewsletterContent(
// Extract text content
const content = $("body").text().trim();
const hasDetails = !!(content && content.length > 0);
return {
thumbnail: thumbnailUrl,
content,
hasDetails,
};
} catch (error: any) {
if (
@@ -59,7 +62,7 @@ async function scrapeNewsletterContent(
return scrapeNewsletterContent(url, retryCount + 1);
}
console.warn("Error scraping newsletter content:", error);
return { thumbnail: null, content: null };
return { thumbnail: null, content: null, hasDetails: false };
}
}
@@ -79,7 +82,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
const $ = cheerio.load(data);
const newsletters: InsertNewsletter[] = [];
// Find all links that start with /archive?id=
const links = $('a[href^="/archive?id="]');
console.log(`Found ${links.length} newsletter links`);
@@ -88,8 +90,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
const url = $element.attr("href");
const fullText = $element.parent().text().trim();
// Extract date and title from the text
// Format is typically: "March 21, 2017 - Title"
const match = fullText.match(/^([A-Za-z]+ \d{1,2}, \d{4}) - (.+)$/);
if (match && url) {
@@ -98,8 +98,7 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
const date = new Date(dateStr).toISOString().split("T")[0];
const fullUrl = `https://app.robly.com${url}`;
// Scrape the newsletter content
const { thumbnail, content } = await scrapeNewsletterContent(fullUrl);
const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(fullUrl);
newsletters.push({
title: title.trim(),
@@ -108,9 +107,10 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
thumbnail,
content,
description: content ? content.slice(0, 200) + "..." : null,
hasDetails,
});
console.log(`Processed newsletter: ${title}`);
console.log(`Processed newsletter: ${title} (hasDetails: ${hasDetails})`);
} catch (err) {
console.warn(
"Error processing date for newsletter:",
@@ -143,3 +143,31 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
throw error;
}
}
/**
 * Re-scrapes content for newsletters that are still missing details.
 *
 * Newsletters whose `hasDetails` flag is falsy are fetched again, one at a time.
 * Only the newsletters whose retry actually produced details are returned;
 * the rest are left out so callers do not overwrite them.
 *
 * @param newsletters - Stored newsletter rows to inspect.
 * @returns The refreshed rows. Each result keeps every field of the stored row
 *   (including its `id`, which callers use to persist the update), so the return
 *   type is `Newsletter[]` rather than the narrower insert shape.
 */
export async function retryMissingDetails(newsletters: Newsletter[]): Promise<Newsletter[]> {
  const pending = newsletters.filter((n) => !n.hasDetails);
  console.log(`Found ${pending.length} newsletters without details to retry`);

  const updatedNewsletters: Newsletter[] = [];

  for (const newsletter of pending) {
    try {
      const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(newsletter.url);
      if (!hasDetails) {
        // Still nothing to show; leave the stored row untouched.
        continue;
      }

      updatedNewsletters.push({
        ...newsletter,
        thumbnail,
        content,
        description: content ? content.slice(0, 200) + "..." : null,
        hasDetails,
      });
      console.log(`Successfully retrieved details for: ${newsletter.title}`);
    } catch (error) {
      // One failing newsletter must not abort the remaining retries.
      console.error(`Failed to retrieve details for ${newsletter.title}:`, error);
    }
  }

  return updatedNewsletters;
}

View File

@@ -17,6 +17,7 @@ export const newsletters = pgTable("newsletters", {
description: text("description"),
thumbnail: text("thumbnail"),
content: text("content"),
hasDetails: boolean("has_details").default(false),
last_checked: timestamp("last_checked"),
});
@@ -27,6 +28,7 @@ export const insertNewsletterSchema = createInsertSchema(newsletters).pick({
description: true,
thumbnail: true,
content: true,
hasDetails: true,
});
/** Insert payload type for a newsletter, inferred from `insertNewsletterSchema`. */
export type InsertNewsletter = z.infer<typeof insertNewsletterSchema>;