perf: 网页搜索改用 Google Search,以改善效果

This commit is contained in:
Soulter
2023-09-30 14:59:25 +08:00
parent 7832ec386e
commit b9e331ebd6
2 changed files with 47 additions and 8 deletions

View File

@@ -10,3 +10,4 @@ Pillow~=9.4.0
GitPython~=3.1.31
nakuru-project
beautifulsoup4
googlesearch-python

View File

@@ -8,6 +8,8 @@ from util.func_call import (
FuncNotFoundError
)
import traceback
from googlesearch import search, SearchResult
def tidy_text(text: str) -> str:
    """Return *text* stripped, with every newline, space and carriage return removed."""
    # One translation pass deletes all three characters at once.
    drop_table = str.maketrans("", "", "\n \r")
    return text.strip().translate(drop_table)
@@ -29,6 +31,27 @@ def special_fetch_zhihu(link: str) -> str:
raise Exception("zhihu none")
return tidy_text(r.text)
def google_web_search(keyword) -> str:
    """Search Google for *keyword* and return the results as one text blob.

    Five results are requested. Each result is rendered as a
    ``# No.<n> / title / url / content`` section, where ``content`` is the
    page excerpt returned by ``fetch_website_content`` or, when the page
    cannot be fetched, the description supplied with the search result.

    Args:
        keyword: The search query.

    Returns:
        The concatenated result sections (an empty string when the search
        yields nothing), or the output of ``web_keyword_search_via_bing``
        when the Google search itself raises.
    """
    # NOTE(review): web_keyword_search_via_bing appears to fall back to this
    # function in turn; confirm the two cannot call each other indefinitely
    # when both providers are unreachable.
    sections = []
    try:
        results = search(keyword, advanced=True, num_results=5)
        for index, result in enumerate(results, start=1):
            # Prefer the fetched page text; keep the search description as
            # the fallback if the page cannot be retrieved.
            content = result.description
            try:
                content = fetch_website_content(result.url)
            except Exception as e:
                # Only ordinary errors are swallowed here, so Ctrl-C and
                # interpreter shutdown still propagate.
                print(f"(google) fetch_website_content err: {str(e)}")
            entry = f"# No.{index}\ntitle: {result.title}\nurl: {result.url}\ncontent: {content}\n\n"
            gu.log(entry, level=gu.LEVEL_DEBUG, max_len=9999)
            sections.append(entry)
    except Exception as e:
        print(f"google search err: {str(e)}")
        return web_keyword_search_via_bing(keyword)
    return "".join(sections)
def web_keyword_search_via_bing(keyword) -> str:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
@@ -41,7 +64,7 @@ def web_keyword_search_via_bing(keyword) -> str:
try:
response = requests.get(url, headers=headers)
response.encoding = "utf-8"
gu.log(f"bing response: {response.text}", tag="bing", level=gu.LEVEL_DEBUG)
gu.log(f"bing response: {response.text}", tag="bing", level=gu.LEVEL_DEBUG, max_len=9999)
soup = BeautifulSoup(response.text, "html.parser")
res = []
ols = soup.find(id="b_results")
@@ -88,7 +111,7 @@ def web_keyword_search_via_bing(keyword) -> str:
time.sleep(1)
print("fail to fetch bing info, using sougou.")
return web_keyword_search_via_sougou(keyword)
return google_web_search(keyword)
def web_keyword_search_via_sougou(keyword) -> str:
headers = {
@@ -131,29 +154,44 @@ def web_keyword_search_via_sougou(keyword) -> str:
return ret
def fetch_website_content(url):
    """Download *url* and return a short plain-text excerpt of the page.

    Elements whose CSS class is ``container``, ``content`` or ``main`` are
    preferred; when none is present, the text of the whole document is used.
    Newlines, carriage returns and tabs are removed, then a fixed window of
    the text is returned: characters 100-800 when a preferred element was
    found, 300-1100 otherwise.

    Args:
        url: Address of the page to fetch.

    Returns:
        The excerpt as a string; it may be empty for short pages.

    Raises:
        Any exception raised by ``requests.get`` (for example when the
        3 second timeout is exceeded) propagates to the caller.
    """
    gu.log(f"fetch_website_content: {url}", tag="fetch_website_content", level=gu.LEVEL_DEBUG)
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
    }
    # A single request, always bounded by the timeout.
    response = requests.get(url, headers=headers, timeout=3)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    # If the page has container / content / main elements, keep only those.
    preferred_classes = ("container", "content", "main")
    chunks = []
    for cls in preferred_classes:
        chunks.extend(node.text for node in soup.find_all(class_=cls))
    found = bool(chunks)
    text = "".join(chunks) if found else soup.text

    # NOTE(review): the viewed source shows replace(" ", " "), a no-op that
    # looks like a whitespace-collapsed literal; restored here as
    # double-space -> single-space. Confirm against the real file.
    text = text.replace("\n", "").replace("  ", " ").replace("\r", "").replace("\t", "")

    # Skip the leading characters and cap the excerpt length.
    text = text[100:800] if found else text[300:1100]

    gu.log("fetch_website_content: end", tag="fetch_website_content", level=gu.LEVEL_DEBUG)
    return text
def web_search(question, provider, session_id):
new_func_call = FuncCall(provider)
new_func_call.add_func("web_keyword_search_via_bing", [{
new_func_call.add_func("google_web_search", [{
"type": "string",
"name": "keyword",
"brief": "必应搜索的关键词(分词,尽量保留所有信息)"
"brief": "google search query (分词,尽量保留所有信息)"
}],
"网页搜索。如果问题需要使用搜索,则调用。",
web_keyword_search_via_bing
"网页搜索。如果问题需要使用搜索(如天气、新闻或任何新的东西),则调用。",
google_web_search
)
new_func_call.add_func("fetch_website_content", [{
"type": "string",