我的代码:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
# 设置 Chrome 选项以启用 headless 模式和自定义 user-agent
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument(
f'--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"'
)
# 初始化 WebDriver
driver = webdriver.Chrome(
executable_path="D:/lab/chromedriver-win64/chromedriver.exe", options=chrome_options
)
url = "https://webs.bjidex.com/sys-bsc-home/#/bscConsole/tradingMarket"
# 访问网页
driver.get(url)
# 初始化一个列表来保存数据
data_list = []
# 爬取数据
for page in range(184):
# 构建翻页按钮的 XPath
if page < 4:
next_button_xpath = "/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[2]/ul/li[10]/button"
elif 4 < page < 181:
next_button_xpath = "/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[2]/ul/li[12]/button"
elif 181 < page:
next_button_xpath = "/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[2]/ul/li[10]/button"
else:
next_button_xpath = "/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[2]/ul/li[11]/button"
# 爬取每页的 10 组数据
for i in range(1, 11):
time.sleep(1) # 等待页面加载新内容
# 构建每组的 XPath
product_name_xpath = f"/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[1]/div/ul/li[{i}]/div[1]/div/div[1]/div/span[1]"
supplier_list_xpath = f"/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[1]/div/ul/li[{i}]/div[1]/div/div[3]/span"
product_type_xpath = f"/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[1]/div/ul/li[{i}]/div[1]/div/div[1]/div/span[2]"
application_scenario_xpath = f"/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[1]/div/ul/li[{i}]/div[1]/div/div[3]/div"
product_description_xpath = f"/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[1]/div/ul/li[{i}]/div[1]/div/div[2]"
price_xpath = f"/html/body/div[2]/div[2]/div/div/div[2]/div/div/section/div/div/div/div[7]/div/div[1]/div/ul/li[{i}]/div[2]/div[1]"
try:
# 供应商提供商品名称
product_name = driver.find_element(By.XPATH, product_name_xpath).text
# 数据供应商名单
supplier_list = driver.find_element(By.XPATH, supplier_list_xpath).text
# 商品类型
product_type = driver.find_element(By.XPATH, product_type_xpath).text
# 应用场景
application_scenario = driver.find_element(
By.XPATH, application_scenario_xpath
).text
# 商品描述
product_description = driver.find_element(
By.XPATH, product_description_xpath
).text
# 价格
price = driver.find_element(By.XPATH, price_xpath).text
# 将数据添加到列表
data_list.append(
{
"页数": page + 1,
"供应商提供商品名称": product_name,
"数据供应商名单": supplier_list,
"商品类型": product_type,
"应用场景": application_scenario,
"商品描述": product_description,
"价格": price,
}
)
except Exception as e:
print(f"Error on page {page + 1}, item {i}: {e}")
# 点击翻页按钮
try:
next_button = WebDriverWait(driver, 2).until( # 等待时间为 2 秒
EC.element_to_be_clickable((By.XPATH, next_button_xpath))
)
next_button.click()
except Exception as e:
print("翻页出错或已经是最后一页:", e)
break # 如果无法翻页,则跳出循环
# 关闭浏览器
driver.quit()
# 将列表转换为 DataFrame
data_df = pd.DataFrame(data_list)
# 输出为表格
data_df.to_csv(
"bjidex.com_data.csv", index=False, encoding="utf_8_sig"
) # 保存为 CSV 文件
print(data_df) # 打印 DataFrame
输出
PS D:\lab\bigdata24.9.9> & C:/tools/miniconda3/python.exe d:/lab/bigdata24.9.9/bjidex.com.py
DevTools listening on ws://127.0.0.1:60057/devtools/browser/66a61aa5-3598-4069-94bd-d4f10be20d96
[42892:8184:0915/232611.717:ERROR:ssl_client_socket_impl.cc(882)] handshake failed; returned -1, SSL error code 1, net_error -101
[42892:8184:0915/232611.834:ERROR:ssl_client_socket_impl.cc(882)] handshake failed; returned -1, SSL error code 1, net_error -101
[42892:8184:0915/232630.213:ERROR:ssl_client_socket_impl.cc(882)] handshake failed; returned -1, SSL error code 1, net_error -101
翻页出错或已经是最后一页: Message:
页数 ... 价格
0 1 ... 0.5 元/次
1 1 ... 0 元/次
2 1 ... 0 元/次
3 1 ... 2.5 元/次
...
37 4 ... 0.2 元/次
38 4 ... 0.1 元/次
39 4 ... 0.15 元/次
[40 rows x 7 columns]
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.