Agent query: I've added the hasDetails tracking feature. Could you check if the newsletters are being displayed correctly and if the content is being fetched properly?
Enhance newsletter scraping: Add retry mechanism for missing details and update database accordingly. Screenshot: https://storage.googleapis.com/screenshot-production-us-central1/9dda30b6-4149-4bce-89dc-76333005952c/d96ca1cd-ef9a-4ef9-ae09-bfd9504e8440.jpg
This commit is contained in:
5
replit.nix
Normal file
5
replit.nix
Normal file
@@ -0,0 +1,5 @@
|
||||
{pkgs}: {
|
||||
deps = [
|
||||
pkgs.postgresql
|
||||
];
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
import type { Express } from "express";
|
||||
import { createServer, type Server } from "http";
|
||||
import { storage } from "./storage";
|
||||
import { scrapeNewsletters } from "./utils";
|
||||
import { scrapeNewsletters, retryMissingDetails } from "./utils";
|
||||
import { Feed } from "feed";
|
||||
import webpush from "web-push";
|
||||
import schedule from "node-schedule";
|
||||
@@ -27,6 +27,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
||||
const existingNewsletters = await storage.getNewsletters();
|
||||
const scrapedNewsletters = await scrapeNewsletters();
|
||||
|
||||
// Import new newsletters
|
||||
const newNewsletters = scrapedNewsletters.filter(scraped =>
|
||||
!existingNewsletters.some(existing =>
|
||||
existing.url === scraped.url
|
||||
@@ -37,7 +38,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
||||
await storage.importNewsletters(newNewsletters);
|
||||
console.log(`Found ${newNewsletters.length} new newsletters, sending notifications...`);
|
||||
|
||||
// Send push notifications
|
||||
// Send push notifications for new newsletters
|
||||
const subscriptions = await storage.getActiveSubscriptions();
|
||||
console.log(`Sending notifications to ${subscriptions.length} subscribers`);
|
||||
|
||||
@@ -47,20 +48,6 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
||||
icon: '/icon.png'
|
||||
});
|
||||
|
||||
|
||||
app.post("/api/subscriptions/:id/settings", async (req, res) => {
|
||||
try {
|
||||
const subscriptionId = parseInt(req.params.id);
|
||||
await storage.saveNotificationSettings(subscriptionId, {
|
||||
newsletter_notifications: req.body.newsletter_notifications
|
||||
});
|
||||
res.json({ message: "Notification settings updated successfully" });
|
||||
} catch (error) {
|
||||
console.error('Error updating notification settings:', error);
|
||||
res.status(500).json({ message: "Failed to update notification settings" });
|
||||
}
|
||||
});
|
||||
|
||||
const results = await Promise.allSettled(
|
||||
subscriptions.map(subscription =>
|
||||
webpush.sendNotification({
|
||||
@@ -77,6 +64,23 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
||||
const failed = results.filter(r => r.status === 'rejected').length;
|
||||
console.log(`Push notifications sent: ${succeeded} succeeded, ${failed} failed`);
|
||||
}
|
||||
|
||||
// Retry fetching details for newsletters without them
|
||||
const newslettersWithoutDetails = await storage.getNewslettersWithoutDetails();
|
||||
const updatedNewsletters = await retryMissingDetails(newslettersWithoutDetails);
|
||||
|
||||
for (const newsletter of updatedNewsletters) {
|
||||
if (newsletter.id) {
|
||||
await storage.updateNewsletterDetails(newsletter.id, {
|
||||
thumbnail: newsletter.thumbnail,
|
||||
content: newsletter.content,
|
||||
description: newsletter.description,
|
||||
hasDetails: newsletter.hasDetails,
|
||||
});
|
||||
console.log(`Updated details for newsletter: ${newsletter.title}`);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error('Background job failed:', error);
|
||||
}
|
||||
@@ -151,6 +155,19 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
||||
}
|
||||
});
|
||||
|
||||
app.post("/api/subscriptions/:id/settings", async (req, res) => {
|
||||
try {
|
||||
const subscriptionId = parseInt(req.params.id);
|
||||
await storage.saveNotificationSettings(subscriptionId, {
|
||||
newsletter_notifications: req.body.newsletter_notifications
|
||||
});
|
||||
res.json({ message: "Notification settings updated successfully" });
|
||||
} catch (error) {
|
||||
console.error('Error updating notification settings:', error);
|
||||
res.status(500).json({ message: "Failed to update notification settings" });
|
||||
}
|
||||
});
|
||||
|
||||
app.get("/api/rss", async (_req, res) => {
|
||||
try {
|
||||
const newsletters = await storage.getNewsletters();
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription } from "@shared/schema";
|
||||
import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription, newsletters, subscriptions, notificationSettings } from "@shared/schema";
|
||||
import { db } from "./db";
|
||||
import { newsletters, subscriptions } from "@shared/schema";
|
||||
import { desc, ilike, or } from "drizzle-orm";
|
||||
import { desc, ilike, or, eq } from "drizzle-orm";
|
||||
|
||||
/**
 * Persistence contract for newsletters and push subscriptions.
 *
 * Implemented in this file by `DatabaseStorage`.
 */
export interface IStorage {
  // --- Newsletter reads ---

  /** Returns the stored newsletters. */
  getNewsletters(): Promise<Newsletter[]>;

  /** Returns the stored newsletters that have not had their details fetched yet. */
  getNewslettersWithoutDetails(): Promise<Newsletter[]>;

  /** Returns the stored newsletters matching a free-text query. */
  searchNewsletters(query: string): Promise<Newsletter[]>;

  // --- Newsletter writes ---

  /** Stores the given newsletters. */
  importNewsletters(newsletters: InsertNewsletter[]): Promise<void>;

  /** Applies a partial set of column updates to the newsletter with the given id. */
  updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void>;

  // --- Subscriptions ---

  /** Stores a new subscription. */
  addSubscription(subscription: InsertSubscription): Promise<void>;

  /** Returns the stored subscriptions. */
  getSubscriptions(): Promise<Subscription[]>;

  /** Returns the subscriptions that should currently receive notifications. */
  getActiveSubscriptions(): Promise<Subscription[]>;
}
|
||||
|
||||
export class DatabaseStorage implements IStorage {
|
||||
@@ -16,6 +18,14 @@ export class DatabaseStorage implements IStorage {
|
||||
return await db.select().from(newsletters).orderBy(desc(newsletters.date));
|
||||
}
|
||||
|
||||
/**
 * Selects the newsletters whose `hasDetails` column equals `false`,
 * ordered by `date` descending.
 */
async getNewslettersWithoutDetails(): Promise<Newsletter[]> {
  const detailsMissing = eq(newsletters.hasDetails, false);
  const rows = await db
    .select()
    .from(newsletters)
    .where(detailsMissing)
    .orderBy(desc(newsletters.date));
  return rows;
}
|
||||
|
||||
async searchNewsletters(query: string): Promise<Newsletter[]> {
|
||||
const lowercaseQuery = query.toLowerCase();
|
||||
return await db
|
||||
@@ -32,7 +42,6 @@ export class DatabaseStorage implements IStorage {
|
||||
}
|
||||
|
||||
async importNewsletters(newNewsletters: InsertNewsletter[]): Promise<void> {
|
||||
// Insert in batches to avoid overwhelming the database
|
||||
const batchSize = 50;
|
||||
for (let i = 0; i < newNewsletters.length; i += batchSize) {
|
||||
const batch = newNewsletters.slice(i, i + batchSize);
|
||||
@@ -40,6 +49,16 @@ export class DatabaseStorage implements IStorage {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Writes the supplied column updates to the newsletter row with the given id
 * and sets `last_checked` to the current time in the same statement.
 *
 * @param id - Primary key of the newsletter row to update.
 * @param updates - Columns to overwrite; `last_checked` is always set here and
 *   overrides any value that might be present in `updates`.
 */
async updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void> {
  const patch = { ...updates, last_checked: new Date() };
  const matchesId = eq(newsletters.id, id);
  await db.update(newsletters).set(patch).where(matchesId);
}
|
||||
|
||||
/** Inserts a single row into the `subscriptions` table. */
async addSubscription(subscription: InsertSubscription): Promise<void> {
  const insertion = db.insert(subscriptions);
  await insertion.values(subscription);
}
|
||||
@@ -55,23 +74,13 @@ export class DatabaseStorage implements IStorage {
|
||||
settings: notificationSettings
|
||||
})
|
||||
.from(subscriptions)
|
||||
.leftJoin(notificationSettings, eq(subscriptions.id, notificationSettings.subscription_id))
|
||||
.leftJoin(
|
||||
notificationSettings,
|
||||
eq(subscriptions.id, notificationSettings.subscription_id)
|
||||
)
|
||||
.where(eq(notificationSettings.newsletter_notifications, true));
|
||||
|
||||
return result.map(r => r.subscription);
|
||||
}
|
||||
|
||||
async saveNotificationSettings(subscriptionId: number, settings: Partial<InsertNotificationSettings>): Promise<void> {
|
||||
await db
|
||||
.insert(notificationSettings)
|
||||
.values({
|
||||
subscription_id: subscriptionId,
|
||||
...settings
|
||||
})
|
||||
.onConflictDoUpdate({
|
||||
target: [notificationSettings.subscription_id],
|
||||
set: settings
|
||||
});
|
||||
return result.map(r => r.subscription);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import axios from "axios";
|
||||
import * as cheerio from "cheerio";
|
||||
import type { InsertNewsletter } from "@shared/schema";
|
||||
import type { InsertNewsletter, Newsletter } from "@shared/schema";
|
||||
|
||||
const ROBLY_ARCHIVE_URL =
|
||||
"https://app.robly.com/public/archives?a=b31b32385b5904b5";
|
||||
@@ -8,9 +8,9 @@ const ROBLY_ARCHIVE_URL =
|
||||
async function scrapeNewsletterContent(
|
||||
url: string,
|
||||
retryCount = 0,
|
||||
): Promise<{ thumbnail: string | null; content: string | null }> {
|
||||
): Promise<{ thumbnail: string | null; content: string | null; hasDetails: boolean }> {
|
||||
try {
|
||||
const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000); // Exponential backoff capped at 10 seconds
|
||||
const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000);
|
||||
if (retryCount > 0) {
|
||||
await new Promise((resolve) => setTimeout(resolve, backoffTime));
|
||||
}
|
||||
@@ -44,9 +44,12 @@ async function scrapeNewsletterContent(
|
||||
// Extract text content
|
||||
const content = $("body").text().trim();
|
||||
|
||||
const hasDetails = !!(content && content.length > 0);
|
||||
|
||||
return {
|
||||
thumbnail: thumbnailUrl,
|
||||
content,
|
||||
hasDetails,
|
||||
};
|
||||
} catch (error: any) {
|
||||
if (
|
||||
@@ -59,7 +62,7 @@ async function scrapeNewsletterContent(
|
||||
return scrapeNewsletterContent(url, retryCount + 1);
|
||||
}
|
||||
console.warn("Error scraping newsletter content:", error);
|
||||
return { thumbnail: null, content: null };
|
||||
return { thumbnail: null, content: null, hasDetails: false };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,7 +82,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
||||
const $ = cheerio.load(data);
|
||||
const newsletters: InsertNewsletter[] = [];
|
||||
|
||||
// Find all links that start with /archive?id=
|
||||
const links = $('a[href^="/archive?id="]');
|
||||
console.log(`Found ${links.length} newsletter links`);
|
||||
|
||||
@@ -88,8 +90,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
||||
const url = $element.attr("href");
|
||||
const fullText = $element.parent().text().trim();
|
||||
|
||||
// Extract date and title from the text
|
||||
// Format is typically: "March 21, 2017 - Title"
|
||||
const match = fullText.match(/^([A-Za-z]+ \d{1,2}, \d{4}) - (.+)$/);
|
||||
|
||||
if (match && url) {
|
||||
@@ -98,8 +98,7 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
||||
const date = new Date(dateStr).toISOString().split("T")[0];
|
||||
const fullUrl = `https://app.robly.com${url}`;
|
||||
|
||||
// Scrape the newsletter content
|
||||
const { thumbnail, content } = await scrapeNewsletterContent(fullUrl);
|
||||
const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(fullUrl);
|
||||
|
||||
newsletters.push({
|
||||
title: title.trim(),
|
||||
@@ -108,9 +107,10 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
||||
thumbnail,
|
||||
content,
|
||||
description: content ? content.slice(0, 200) + "..." : null,
|
||||
hasDetails,
|
||||
});
|
||||
|
||||
console.log(`Processed newsletter: ${title}`);
|
||||
console.log(`Processed newsletter: ${title} (hasDetails: ${hasDetails})`);
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
"Error processing date for newsletter:",
|
||||
@@ -143,3 +143,31 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Re-scrapes the newsletters whose `hasDetails` flag is falsy and returns the
 * ones for which the scrape reported details this time.
 *
 * Each returned entry is the original stored row (spread, so `id` and every
 * other stored column are carried over) with `thumbnail`, `content`,
 * `description` and `hasDetails` replaced by the freshly scraped values. The
 * return type is `Newsletter[]` rather than `InsertNewsletter[]` so that
 * callers can read `id` when writing the update back.
 *
 * Newsletters are processed one at a time. A newsletter is left out of the
 * result when the scrape still reports no details or when it throws; a thrown
 * error is logged and does not stop the remaining newsletters.
 *
 * @param newsletters - Stored rows to examine; entries with a truthy
 *   `hasDetails` are ignored.
 * @returns The rows whose details were successfully retrieved.
 */
export async function retryMissingDetails(newsletters: Newsletter[]): Promise<Newsletter[]> {
  const newslettersWithoutDetails = newsletters.filter((n) => !n.hasDetails);
  console.log(`Found ${newslettersWithoutDetails.length} newsletters without details to retry`);

  const updatedNewsletters: Newsletter[] = [];

  for (const newsletter of newslettersWithoutDetails) {
    try {
      const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(newsletter.url);

      // Only report rows that actually gained details; the rest stay flagged
      // for the next retry pass.
      if (hasDetails) {
        updatedNewsletters.push({
          ...newsletter,
          thumbnail,
          content,
          // Same preview rule as the initial scrape: first 200 characters plus an ellipsis.
          description: content ? content.slice(0, 200) + "..." : null,
          hasDetails,
        });
        console.log(`Successfully retrieved details for: ${newsletter.title}`);
      }
    } catch (error) {
      console.error(`Failed to retrieve details for ${newsletter.title}:`, error);
    }
  }

  return updatedNewsletters;
}
|
||||
@@ -17,6 +17,7 @@ export const newsletters = pgTable("newsletters", {
|
||||
description: text("description"),
|
||||
thumbnail: text("thumbnail"),
|
||||
content: text("content"),
|
||||
hasDetails: boolean("has_details").default(false),
|
||||
last_checked: timestamp("last_checked"),
|
||||
});
|
||||
|
||||
@@ -27,6 +28,7 @@ export const insertNewsletterSchema = createInsertSchema(newsletters).pick({
|
||||
description: true,
|
||||
thumbnail: true,
|
||||
content: true,
|
||||
hasDetails: true,
|
||||
});
|
||||
|
||||
export type InsertNewsletter = z.infer<typeof insertNewsletterSchema>;
|
||||
@@ -63,4 +65,4 @@ export const insertNotificationSettingsSchema =
|
||||
export type InsertNotificationSettings = z.infer<
|
||||
typeof insertNotificationSettingsSchema
|
||||
>;
|
||||
export type NotificationSettings = typeof notificationSettings.$inferSelect;
|
||||
export type NotificationSettings = typeof notificationSettings.$inferSelect;
|
||||
Reference in New Issue
Block a user