From 38a1dfea980c4a67f9c1ecef608c76f1509009f8 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Sun, 19 May 2024 15:08:22 +0800 Subject: [PATCH] fix: web content scraper add proxy --- util/agent/web_searcher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util/agent/web_searcher.py b/util/agent/web_searcher.py index d1a1e309..e07f8084 100644 --- a/util/agent/web_searcher.py +++ b/util/agent/web_searcher.py @@ -3,6 +3,7 @@ import random import json import asyncio import aiohttp +import os from readability import Document from bs4 import BeautifulSoup @@ -22,6 +23,7 @@ logger: Logger = LogManager.GetLogger(log_name='astrbot-core') bing_search = Bing() sogo_search = Sogo() google = Google() +proxy = os.environ.get("HTTPS_PROXY", None) def tidy_text(text: str) -> str: ''' @@ -80,7 +82,7 @@ async def search_from_bing(keyword: str) -> str: except: site_result = "" site_result = site_result[:600] + "..." if len(site_result) > 600 else site_result - ret += f"{idx}. {i.title}\n{site_result}\n\n" + ret += f"{idx}. {i.title} \n{i.snippet}\n{site_result}\n\n" idx += 1 return ret @@ -89,7 +91,7 @@ async def fetch_website_content(url): header = HEADERS header.update({'User-Agent': random.choice(USER_AGENTS)}) async with aiohttp.ClientSession() as session: - async with session.get(url, headers=HEADERS, timeout=6) as response: + async with session.get(url, headers=HEADERS, timeout=6, proxy=proxy) as response: html = await response.text(encoding="utf-8") doc = Document(html) ret = doc.summary(html_partial=True)