@googlefans 谢谢,调试通过可用的代码:
```
# Dependencies: pip install requests beautifulsoup4
#
# Scrape a user-supplied web page and download every linked PDF into a
# local "pdf_downloads" directory, naming each file after its link text.
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Prompt the user for the page URL to scrape.
url = input('请输入要抓取的网页 URL: ')

# Directory that will hold the downloaded PDFs (created if missing).
download_dir = 'pdf_downloads'
os.makedirs(download_dir, exist_ok=True)

# Fetch the page; a timeout prevents the script from hanging forever on a
# dead server. Force UTF-8 so Chinese link texts decode correctly.
response = requests.get(url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'

# Parse the HTML and walk every anchor that carries an href.
soup = BeautifulSoup(response.text, 'html.parser')

for link in soup.find_all('a', href=True):
    href = link['href']
    if not href.lower().endswith('.pdf'):
        continue

    # Use the link text as the file name, stripped of characters that are
    # illegal or awkward in file names.
    file_name = link.get_text(strip=True) + '.pdf'
    file_name = "".join(x for x in file_name if x.isalnum() or x in " ._-")
    if file_name in ('', '.pdf'):
        # Link text sanitized away entirely — fall back to the URL's basename.
        file_name = os.path.basename(href)

    # Resolve relative hrefs against the page URL and download the PDF.
    download_url = urljoin(url, href)
    pdf_response = requests.get(download_url, timeout=60)
    pdf_response.raise_for_status()

    # Write the PDF bytes to disk.
    file_path = os.path.join(download_dir, file_name)
    with open(file_path, 'wb') as f:
        f.write(pdf_response.content)

    print(f"Downloaded: {file_name}")

print("All PDF files have been downloaded.")
```