Agent query: I've added the hasDetails tracking feature. Could you check if the newsletters are being displayed correctly and if the content is being fetched properly?
Enhance newsletter scraping: Add retry mechanism for missing details and update database accordingly. Screenshot: https://storage.googleapis.com/screenshot-production-us-central1/9dda30b6-4149-4bce-89dc-76333005952c/d96ca1cd-ef9a-4ef9-ae09-bfd9504e8440.jpg
This commit is contained in:
5
replit.nix
Normal file
5
replit.nix
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Nix expression for replit.nix: a function of `pkgs` returning an attribute set
# whose `deps` list contains a single package, pkgs.postgresql.
{ pkgs }:
{
  deps = with pkgs; [
    postgresql
  ];
}
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { Express } from "express";
|
import type { Express } from "express";
|
||||||
import { createServer, type Server } from "http";
|
import { createServer, type Server } from "http";
|
||||||
import { storage } from "./storage";
|
import { storage } from "./storage";
|
||||||
import { scrapeNewsletters } from "./utils";
|
import { scrapeNewsletters, retryMissingDetails } from "./utils";
|
||||||
import { Feed } from "feed";
|
import { Feed } from "feed";
|
||||||
import webpush from "web-push";
|
import webpush from "web-push";
|
||||||
import schedule from "node-schedule";
|
import schedule from "node-schedule";
|
||||||
@@ -27,6 +27,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
|||||||
const existingNewsletters = await storage.getNewsletters();
|
const existingNewsletters = await storage.getNewsletters();
|
||||||
const scrapedNewsletters = await scrapeNewsletters();
|
const scrapedNewsletters = await scrapeNewsletters();
|
||||||
|
|
||||||
|
// Import new newsletters
|
||||||
const newNewsletters = scrapedNewsletters.filter(scraped =>
|
const newNewsletters = scrapedNewsletters.filter(scraped =>
|
||||||
!existingNewsletters.some(existing =>
|
!existingNewsletters.some(existing =>
|
||||||
existing.url === scraped.url
|
existing.url === scraped.url
|
||||||
@@ -37,7 +38,7 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
|||||||
await storage.importNewsletters(newNewsletters);
|
await storage.importNewsletters(newNewsletters);
|
||||||
console.log(`Found ${newNewsletters.length} new newsletters, sending notifications...`);
|
console.log(`Found ${newNewsletters.length} new newsletters, sending notifications...`);
|
||||||
|
|
||||||
// Send push notifications
|
// Send push notifications for new newsletters
|
||||||
const subscriptions = await storage.getActiveSubscriptions();
|
const subscriptions = await storage.getActiveSubscriptions();
|
||||||
console.log(`Sending notifications to ${subscriptions.length} subscribers`);
|
console.log(`Sending notifications to ${subscriptions.length} subscribers`);
|
||||||
|
|
||||||
@@ -47,20 +48,6 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
|||||||
icon: '/icon.png'
|
icon: '/icon.png'
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
app.post("/api/subscriptions/:id/settings", async (req, res) => {
|
|
||||||
try {
|
|
||||||
const subscriptionId = parseInt(req.params.id);
|
|
||||||
await storage.saveNotificationSettings(subscriptionId, {
|
|
||||||
newsletter_notifications: req.body.newsletter_notifications
|
|
||||||
});
|
|
||||||
res.json({ message: "Notification settings updated successfully" });
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error updating notification settings:', error);
|
|
||||||
res.status(500).json({ message: "Failed to update notification settings" });
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
const results = await Promise.allSettled(
|
const results = await Promise.allSettled(
|
||||||
subscriptions.map(subscription =>
|
subscriptions.map(subscription =>
|
||||||
webpush.sendNotification({
|
webpush.sendNotification({
|
||||||
@@ -77,6 +64,23 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
|||||||
const failed = results.filter(r => r.status === 'rejected').length;
|
const failed = results.filter(r => r.status === 'rejected').length;
|
||||||
console.log(`Push notifications sent: ${succeeded} succeeded, ${failed} failed`);
|
console.log(`Push notifications sent: ${succeeded} succeeded, ${failed} failed`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Retry fetching details for newsletters without them
|
||||||
|
const newslettersWithoutDetails = await storage.getNewslettersWithoutDetails();
|
||||||
|
const updatedNewsletters = await retryMissingDetails(newslettersWithoutDetails);
|
||||||
|
|
||||||
|
for (const newsletter of updatedNewsletters) {
|
||||||
|
if (newsletter.id) {
|
||||||
|
await storage.updateNewsletterDetails(newsletter.id, {
|
||||||
|
thumbnail: newsletter.thumbnail,
|
||||||
|
content: newsletter.content,
|
||||||
|
description: newsletter.description,
|
||||||
|
hasDetails: newsletter.hasDetails,
|
||||||
|
});
|
||||||
|
console.log(`Updated details for newsletter: ${newsletter.title}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Background job failed:', error);
|
console.error('Background job failed:', error);
|
||||||
}
|
}
|
||||||
@@ -151,6 +155,19 @@ export async function registerRoutes(app: Express): Promise<Server> {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// POST /api/subscriptions/:id/settings
// Saves the `newsletter_notifications` value from the request body for the
// subscription identified by the `:id` path segment.
// Responds 400 when `:id` is not numeric and 500 when the storage call throws.
app.post("/api/subscriptions/:id/settings", async (req, res) => {
  // Parse with an explicit radix and reject non-numeric ids up front, so NaN
  // never reaches the storage layer and surfaces as a generic 500.
  const subscriptionId = Number.parseInt(req.params.id, 10);
  if (Number.isNaN(subscriptionId)) {
    res.status(400).json({ message: "Invalid subscription id" });
    return;
  }

  try {
    await storage.saveNotificationSettings(subscriptionId, {
      newsletter_notifications: req.body.newsletter_notifications
    });
    res.json({ message: "Notification settings updated successfully" });
  } catch (error) {
    console.error('Error updating notification settings:', error);
    res.status(500).json({ message: "Failed to update notification settings" });
  }
});
|
||||||
|
|
||||||
app.get("/api/rss", async (_req, res) => {
|
app.get("/api/rss", async (_req, res) => {
|
||||||
try {
|
try {
|
||||||
const newsletters = await storage.getNewsletters();
|
const newsletters = await storage.getNewsletters();
|
||||||
|
|||||||
@@ -1,14 +1,16 @@
|
|||||||
import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription } from "@shared/schema";
|
import { type Newsletter, type InsertNewsletter, type Subscription, type InsertSubscription, newsletters, subscriptions, notificationSettings } from "@shared/schema";
|
||||||
import { db } from "./db";
|
import { db } from "./db";
|
||||||
import { newsletters, subscriptions } from "@shared/schema";
|
import { desc, ilike, or, eq } from "drizzle-orm";
|
||||||
import { desc, ilike, or } from "drizzle-orm";
|
|
||||||
|
|
||||||
export interface IStorage {
|
export interface IStorage {
|
||||||
getNewsletters(): Promise<Newsletter[]>;
|
getNewsletters(): Promise<Newsletter[]>;
|
||||||
|
getNewslettersWithoutDetails(): Promise<Newsletter[]>;
|
||||||
searchNewsletters(query: string): Promise<Newsletter[]>;
|
searchNewsletters(query: string): Promise<Newsletter[]>;
|
||||||
importNewsletters(newsletters: InsertNewsletter[]): Promise<void>;
|
importNewsletters(newsletters: InsertNewsletter[]): Promise<void>;
|
||||||
|
updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void>;
|
||||||
addSubscription(subscription: InsertSubscription): Promise<void>;
|
addSubscription(subscription: InsertSubscription): Promise<void>;
|
||||||
getSubscriptions(): Promise<Subscription[]>;
|
getSubscriptions(): Promise<Subscription[]>;
|
||||||
|
getActiveSubscriptions(): Promise<Subscription[]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
export class DatabaseStorage implements IStorage {
|
export class DatabaseStorage implements IStorage {
|
||||||
@@ -16,6 +18,14 @@ export class DatabaseStorage implements IStorage {
|
|||||||
return await db.select().from(newsletters).orderBy(desc(newsletters.date));
|
return await db.select().from(newsletters).orderBy(desc(newsletters.date));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Loads the newsletter rows whose `hasDetails` column equals `false`,
 * ordered by `date` descending.
 */
async getNewslettersWithoutDetails(): Promise<Newsletter[]> {
  const missingDetails = eq(newsletters.hasDetails, false);
  const newestFirst = desc(newsletters.date);
  const rows = await db
    .select()
    .from(newsletters)
    .where(missingDetails)
    .orderBy(newestFirst);
  return rows;
}
|
||||||
|
|
||||||
async searchNewsletters(query: string): Promise<Newsletter[]> {
|
async searchNewsletters(query: string): Promise<Newsletter[]> {
|
||||||
const lowercaseQuery = query.toLowerCase();
|
const lowercaseQuery = query.toLowerCase();
|
||||||
return await db
|
return await db
|
||||||
@@ -32,7 +42,6 @@ export class DatabaseStorage implements IStorage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async importNewsletters(newNewsletters: InsertNewsletter[]): Promise<void> {
|
async importNewsletters(newNewsletters: InsertNewsletter[]): Promise<void> {
|
||||||
// Insert in batches to avoid overwhelming the database
|
|
||||||
const batchSize = 50;
|
const batchSize = 50;
|
||||||
for (let i = 0; i < newNewsletters.length; i += batchSize) {
|
for (let i = 0; i < newNewsletters.length; i += batchSize) {
|
||||||
const batch = newNewsletters.slice(i, i + batchSize);
|
const batch = newNewsletters.slice(i, i + batchSize);
|
||||||
@@ -40,6 +49,16 @@ export class DatabaseStorage implements IStorage {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Writes `updates` onto the newsletter row whose id equals `id` and sets
 * `last_checked` to the current time in the same statement.
 *
 * @param id Primary key of the newsletter row to update.
 * @param updates Column values to overwrite.
 */
async updateNewsletterDetails(id: number, updates: Partial<InsertNewsletter>): Promise<void> {
  // last_checked is listed after the spread so it always takes precedence.
  const patch = { ...updates, last_checked: new Date() };
  const targetRow = eq(newsletters.id, id);
  await db.update(newsletters).set(patch).where(targetRow);
}
|
||||||
|
|
||||||
async addSubscription(subscription: InsertSubscription): Promise<void> {
|
async addSubscription(subscription: InsertSubscription): Promise<void> {
|
||||||
await db.insert(subscriptions).values(subscription);
|
await db.insert(subscriptions).values(subscription);
|
||||||
}
|
}
|
||||||
@@ -55,24 +74,14 @@ export class DatabaseStorage implements IStorage {
|
|||||||
settings: notificationSettings
|
settings: notificationSettings
|
||||||
})
|
})
|
||||||
.from(subscriptions)
|
.from(subscriptions)
|
||||||
.leftJoin(notificationSettings, eq(subscriptions.id, notificationSettings.subscription_id))
|
.leftJoin(
|
||||||
|
notificationSettings,
|
||||||
|
eq(subscriptions.id, notificationSettings.subscription_id)
|
||||||
|
)
|
||||||
.where(eq(notificationSettings.newsletter_notifications, true));
|
.where(eq(notificationSettings.newsletter_notifications, true));
|
||||||
|
|
||||||
return result.map(r => r.subscription);
|
return result.map(r => r.subscription);
|
||||||
}
|
}
|
||||||
|
|
||||||
async saveNotificationSettings(subscriptionId: number, settings: Partial<InsertNotificationSettings>): Promise<void> {
|
|
||||||
await db
|
|
||||||
.insert(notificationSettings)
|
|
||||||
.values({
|
|
||||||
subscription_id: subscriptionId,
|
|
||||||
...settings
|
|
||||||
})
|
|
||||||
.onConflictDoUpdate({
|
|
||||||
target: [notificationSettings.subscription_id],
|
|
||||||
set: settings
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export const storage = new DatabaseStorage();
|
export const storage = new DatabaseStorage();
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import * as cheerio from "cheerio";
|
import * as cheerio from "cheerio";
|
||||||
import type { InsertNewsletter } from "@shared/schema";
|
import type { InsertNewsletter, Newsletter } from "@shared/schema";
|
||||||
|
|
||||||
const ROBLY_ARCHIVE_URL =
|
const ROBLY_ARCHIVE_URL =
|
||||||
"https://app.robly.com/public/archives?a=b31b32385b5904b5";
|
"https://app.robly.com/public/archives?a=b31b32385b5904b5";
|
||||||
@@ -8,9 +8,9 @@ const ROBLY_ARCHIVE_URL =
|
|||||||
async function scrapeNewsletterContent(
|
async function scrapeNewsletterContent(
|
||||||
url: string,
|
url: string,
|
||||||
retryCount = 0,
|
retryCount = 0,
|
||||||
): Promise<{ thumbnail: string | null; content: string | null }> {
|
): Promise<{ thumbnail: string | null; content: string | null; hasDetails: boolean }> {
|
||||||
try {
|
try {
|
||||||
const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000); // Exponential backoff capped at 10 seconds
|
const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 1000);
|
||||||
if (retryCount > 0) {
|
if (retryCount > 0) {
|
||||||
await new Promise((resolve) => setTimeout(resolve, backoffTime));
|
await new Promise((resolve) => setTimeout(resolve, backoffTime));
|
||||||
}
|
}
|
||||||
@@ -44,9 +44,12 @@ async function scrapeNewsletterContent(
|
|||||||
// Extract text content
|
// Extract text content
|
||||||
const content = $("body").text().trim();
|
const content = $("body").text().trim();
|
||||||
|
|
||||||
|
const hasDetails = !!(content && content.length > 0);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
thumbnail: thumbnailUrl,
|
thumbnail: thumbnailUrl,
|
||||||
content,
|
content,
|
||||||
|
hasDetails,
|
||||||
};
|
};
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
if (
|
if (
|
||||||
@@ -59,7 +62,7 @@ async function scrapeNewsletterContent(
|
|||||||
return scrapeNewsletterContent(url, retryCount + 1);
|
return scrapeNewsletterContent(url, retryCount + 1);
|
||||||
}
|
}
|
||||||
console.warn("Error scraping newsletter content:", error);
|
console.warn("Error scraping newsletter content:", error);
|
||||||
return { thumbnail: null, content: null };
|
return { thumbnail: null, content: null, hasDetails: false };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -79,7 +82,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
|||||||
const $ = cheerio.load(data);
|
const $ = cheerio.load(data);
|
||||||
const newsletters: InsertNewsletter[] = [];
|
const newsletters: InsertNewsletter[] = [];
|
||||||
|
|
||||||
// Find all links that start with /archive?id=
|
|
||||||
const links = $('a[href^="/archive?id="]');
|
const links = $('a[href^="/archive?id="]');
|
||||||
console.log(`Found ${links.length} newsletter links`);
|
console.log(`Found ${links.length} newsletter links`);
|
||||||
|
|
||||||
@@ -88,8 +90,6 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
|||||||
const url = $element.attr("href");
|
const url = $element.attr("href");
|
||||||
const fullText = $element.parent().text().trim();
|
const fullText = $element.parent().text().trim();
|
||||||
|
|
||||||
// Extract date and title from the text
|
|
||||||
// Format is typically: "March 21, 2017 - Title"
|
|
||||||
const match = fullText.match(/^([A-Za-z]+ \d{1,2}, \d{4}) - (.+)$/);
|
const match = fullText.match(/^([A-Za-z]+ \d{1,2}, \d{4}) - (.+)$/);
|
||||||
|
|
||||||
if (match && url) {
|
if (match && url) {
|
||||||
@@ -98,8 +98,7 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
|||||||
const date = new Date(dateStr).toISOString().split("T")[0];
|
const date = new Date(dateStr).toISOString().split("T")[0];
|
||||||
const fullUrl = `https://app.robly.com${url}`;
|
const fullUrl = `https://app.robly.com${url}`;
|
||||||
|
|
||||||
// Scrape the newsletter content
|
const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(fullUrl);
|
||||||
const { thumbnail, content } = await scrapeNewsletterContent(fullUrl);
|
|
||||||
|
|
||||||
newsletters.push({
|
newsletters.push({
|
||||||
title: title.trim(),
|
title: title.trim(),
|
||||||
@@ -108,9 +107,10 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
|||||||
thumbnail,
|
thumbnail,
|
||||||
content,
|
content,
|
||||||
description: content ? content.slice(0, 200) + "..." : null,
|
description: content ? content.slice(0, 200) + "..." : null,
|
||||||
|
hasDetails,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(`Processed newsletter: ${title}`);
|
console.log(`Processed newsletter: ${title} (hasDetails: ${hasDetails})`);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn(
|
console.warn(
|
||||||
"Error processing date for newsletter:",
|
"Error processing date for newsletter:",
|
||||||
@@ -143,3 +143,31 @@ export async function scrapeNewsletters(): Promise<InsertNewsletter[]> {
|
|||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Re-scrapes content for stored newsletters that are still flagged as lacking details.
 *
 * Rows are processed one at a time, in input order. Only rows for which the
 * scrape reported `hasDetails` are included in the result.
 *
 * The return type is `Newsletter[]` rather than `InsertNewsletter[]` because each
 * result is a copy of the stored row, so `id` is present and visible to callers
 * that need it to persist the refreshed values.
 *
 * @param newsletters Stored newsletter rows; entries whose `hasDetails` is truthy are skipped.
 * @returns Copies of the refreshed rows with `thumbnail`, `content`, `description`
 *          and `hasDetails` replaced by the freshly scraped values.
 */
export async function retryMissingDetails(newsletters: Newsletter[]): Promise<Newsletter[]> {
  const newslettersWithoutDetails = newsletters.filter((n) => !n.hasDetails);
  console.log(`Found ${newslettersWithoutDetails.length} newsletters without details to retry`);

  const updatedNewsletters: Newsletter[] = [];

  for (const newsletter of newslettersWithoutDetails) {
    try {
      const { thumbnail, content, hasDetails } = await scrapeNewsletterContent(newsletter.url);

      // Nothing usable came back; leave this row out of the result.
      if (!hasDetails) {
        continue;
      }

      updatedNewsletters.push({
        ...newsletter,
        thumbnail,
        content,
        description: content ? content.slice(0, 200) + "..." : null,
        hasDetails,
      });
      console.log(`Successfully retrieved details for: ${newsletter.title}`);
    } catch (error) {
      // One failing newsletter must not abort the rest of the batch.
      console.error(`Failed to retrieve details for ${newsletter.title}:`, error);
    }
  }

  return updatedNewsletters;
}
|
||||||
@@ -17,6 +17,7 @@ export const newsletters = pgTable("newsletters", {
|
|||||||
description: text("description"),
|
description: text("description"),
|
||||||
thumbnail: text("thumbnail"),
|
thumbnail: text("thumbnail"),
|
||||||
content: text("content"),
|
content: text("content"),
|
||||||
|
hasDetails: boolean("has_details").default(false),
|
||||||
last_checked: timestamp("last_checked"),
|
last_checked: timestamp("last_checked"),
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -27,6 +28,7 @@ export const insertNewsletterSchema = createInsertSchema(newsletters).pick({
|
|||||||
description: true,
|
description: true,
|
||||||
thumbnail: true,
|
thumbnail: true,
|
||||||
content: true,
|
content: true,
|
||||||
|
hasDetails: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
export type InsertNewsletter = z.infer<typeof insertNewsletterSchema>;
|
export type InsertNewsletter = z.infer<typeof insertNewsletterSchema>;
|
||||||
|
|||||||
Reference in New Issue
Block a user