From d31bb03b41d2c1c04e71f28535f110422b884ef7 Mon Sep 17 00:00:00 2001 From: TerribleDev <1020010-TerribleDev@users.noreply.replit.com> Date: Sat, 15 Feb 2025 19:53:50 +0000 Subject: [PATCH] User checkpoint: Improve newsletter scraping robustness by adding retry logic and handling AWS WAF and rate limiting. --- server/utils.ts | 97 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/server/utils.ts b/server/utils.ts index 304fa2c..580ebb3 100644 --- a/server/utils.ts +++ b/server/utils.ts @@ -1,35 +1,64 @@ -import axios from 'axios'; -import * as cheerio from 'cheerio'; -import type { InsertNewsletter } from '@shared/schema'; +import axios from "axios"; +import * as cheerio from "cheerio"; +import type { InsertNewsletter } from "@shared/schema"; -const ROBLY_ARCHIVE_URL = 'https://app.robly.com/public/archives?a=b31b32385b5904b5'; +const ROBLY_ARCHIVE_URL = + "https://app.robly.com/public/archives?a=b31b32385b5904b5"; -async function scrapeNewsletterContent(url: string) { +async function scrapeNewsletterContent( + url: string, + retryCount = 0, +): Promise<{ thumbnail: string | null; content: string | null }> { try { + const backoffTime = Math.min(1000 * Math.pow(2, retryCount), 10000); // Exponential backoff capped at 10 seconds + if (retryCount > 0) { + await new Promise((resolve) => setTimeout(resolve, backoffTime)); + } + const { data } = await axios.get(url, { headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + Accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", }, - timeout: 15000 + timeout: 15000, }); + if ( + data.includes("AwsWafIntegration.checkForceRefresh") && + retryCount < 3 + ) { + console.log(`AWS WAF detected, waiting before retry ${retryCount + 1}/3`); + await new Promise((resolve) => setTimeout(resolve, 1000)); + return scrapeNewsletterContent(url, retryCount + 1); + } + const $ = cheerio.load(data); // Get the second image as thumbnail - const images = $('img').toArray(); - const thumbnailUrl = images.length > 1 ? $(images[1]).attr('src') : null; + const images = $("img").toArray(); + const thumbnailUrl = images.length > 1 ? $(images[1]).attr("src") : null; // Extract text content - const content = $('body').text().trim(); + const content = $("body").text().trim(); return { thumbnail: thumbnailUrl, - content + content, }; - } catch (error) { - console.warn('Error scraping newsletter content:', error); + } catch (error: any) { + if ( + (error.response?.status === 429 || error.code === "ECONNRESET") && + retryCount < 5 + ) { + console.log( + `Rate limited or connection reset, attempt ${retryCount + 1}/5`, + ); + return scrapeNewsletterContent(url, retryCount + 1); + } + console.warn("Error scraping newsletter content:", error); return { thumbnail: null, content: null }; } } @@ -38,11 +67,13 @@ export async function scrapeNewsletters(): Promise { try { const { data } = await axios.get(ROBLY_ARCHIVE_URL, { headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + Accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", }, - timeout: 10000 + timeout: 10000, }); const $ = cheerio.load(data); @@ -54,7 +85,7 @@ export async function scrapeNewsletters(): Promise { for (const element of links.toArray()) { const $element = $(element); - const url = $element.attr('href'); + const url = $element.attr("href"); const fullText = $element.parent().text().trim(); // Extract date and title from the text @@ -64,7 +95,7 @@ export async function scrapeNewsletters(): Promise { if (match && url) { const [, dateStr, title] = match; try { - const date = new Date(dateStr).toISOString().split('T')[0]; + const date = new Date(dateStr).toISOString().split("T")[0]; const fullUrl = `https://app.robly.com${url}`; // Scrape the newsletter content @@ -76,33 +107,39 @@ export async function scrapeNewsletters(): Promise { url: fullUrl, thumbnail, content, - description: content ? content.slice(0, 200) + '...' : null + description: content ? content.slice(0, 200) + "..." : null, }); console.log(`Processed newsletter: ${title}`); } catch (err) { - console.warn('Error processing date for newsletter:', { dateStr, title }, err); + console.warn( + "Error processing date for newsletter:", + { dateStr, title }, + err, + ); } } } if (newsletters.length === 0) { - console.error('No newsletters found in HTML. First 500 chars of response:', data.slice(0, 500)); - throw new Error('No newsletters found in the archive'); + console.error( + "No newsletters found in HTML. First 500 chars of response:", + data.slice(0, 500), + ); + throw new Error("No newsletters found in the archive"); } console.log(`Successfully scraped ${newsletters.length} newsletters`); return newsletters; - } catch (error) { - console.error('Error scraping newsletters:', error); + console.error("Error scraping newsletters:", error); if (axios.isAxiosError(error)) { - console.error('Axios error details:', { + console.error("Axios error details:", { status: error.response?.status, statusText: error.response?.statusText, - data: error.response?.data + data: error.response?.data, }); } throw error; } -} \ No newline at end of file +}