Python 爬虫 ip 被封,公司给了个付费快代理接口,我先验证付费接口 ip 的可用性,然后拿来爬目标网站,还是出现 requests.exceptions.ProxyError

2018-06-19 20:22:23 +08:00
 U87

import random
import time
from random import choice

import requests
from lxml import etree

def get_proxy(max_attempts=5, check_timeout=3):
    """Fetch one proxy from the paid API and return it once it passes a check.

    A proxy is accepted when a GET to the target listing page through it
    answers with HTTP 200.  Failed candidates are discarded and a new one is
    requested, up to ``max_attempts`` times.

    Note that passing the check only proves the proxy worked at that moment;
    it can still fail by the time the caller uses it, so callers must keep
    handling ``requests.exceptions.RequestException`` on their own requests.

    Args:
        max_attempts: How many candidate proxies to try before giving up.
        check_timeout: Timeout in seconds for the validation request.

    Returns:
        A ``proxies`` mapping for ``requests``, e.g. ``{'http': 'http://ip:port'}``.

    Raises:
        requests.exceptions.ProxyError: No candidate passed validation.
    """
    url = 'http://svip.kuaidaili.com/api/getproxy/?orderid=&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&quality=2&sep=1'
    check_url = 'http://nj.58.com/chuzu/?key=%E7%A7%9F%E6%88%BF'

    last_error = None
    # Iterate instead of recursing: the original retry branch dropped the
    # recursive result and therefore returned None, which made the caller
    # send its request without any proxy at all.
    for _ in range(max_attempts):
        try:
            # Strip surrounding whitespace so it cannot end up in the proxy URL.
            proxy_temp = requests.get(url=url, timeout=1).text.strip()
            if not proxy_temp:
                continue
            proxy = {'http': 'http://{}'.format(proxy_temp)}
            status = requests.get(
                url=check_url, proxies=proxy, timeout=check_timeout
            ).status_code
            if status == 200:
                return proxy
        except requests.exceptions.RequestException as e:
            # A dead or slow candidate is expected; remember why and retry.
            last_error = e

    raise requests.exceptions.ProxyError(
        'no working proxy after {} attempts (last error: {})'.format(
            max_attempts, last_error
        )
    ) from last_error

def _parse_brand_apartment(html):
    """Extract the listing fields from a "pinpaigongyu" detail page.

    Raises IndexError when an expected node is missing from the page.
    """
    info = '//ul[@class="house-info-list"]'
    return {
        'phone': str(html.xpath('//div[@class="phonenum getPrivateCallBtnStyle"]/text()')),
        # NOTE(review): after the whitespace split() the token cannot contain
        # ' [', so the trailing [1] raises IndexError on every page. Kept as
        # written; confirm the intended slicing against the live markup.
        'rent_type': html.xpath('//div[@class="housedetail center cf"]/h2/text()')[0].split()[0].split('] ')[0].split(' [')[1],
        'area': html.xpath(info + '/li[1]/span/text()')[0].split()[0] + "平",
        'room_type': html.xpath(info + '/li[2]/span/text()')[0].split()[0],
        'address': html.xpath(info + '/li[4]/span/text()')[0].strip(),
        'traffic': str(html.xpath(info + '/li[5]/span/text()')),
        'pictures': html.xpath('//ul[@id="pic-list"]/li/img/@lazy_src'),
        'house_description': html.xpath('//p[@id="desc"]/text()')[0].replace(' ', ''),
    }


def _parse_regular_listing(html):
    """Extract the listing fields from an ordinary rental detail page.

    Raises IndexError when an expected node is missing from the page.
    """
    info = '//ul[@class="f14"]'
    # Room type and area are the first two whitespace-separated tokens of
    # the same text node.
    room_and_area = html.xpath(info + '/li[2]/span[2]/text()')[0].split()
    return {
        'phone': str(html.xpath('//div[@class="house-chat-phonenum"]/p[@class="phone-num"]/text()')),
        'rent_type': html.xpath(info + '/li[1]/span[2]/text()')[0].split('-')[0],
        'area': room_and_area[1] + "平",
        'room_type': room_and_area[0],
        'address': html.xpath(info + '/li[6]/span[2]/text()')[0].strip(),
        'traffic': str(html.xpath(info + '/li[5]/em/text()')),
        'pictures': html.xpath('//ul[@id="housePicList"]/li/img/@lazy_src'),
        'house_description': str(html.xpath('//ul[@class="introduce-item"]/li[2]/span[@class="a2"]//text()')).strip(),
    }


def crawl():
    """Walk every listing page of the rental index and print each phone field.

    The first page is fetched to read the page count from the pager; each
    listing page and each detail page is then fetched through a fresh proxy
    from ``get_proxy()`` with a randomly chosen User-Agent.  A failed request
    or an unparsable page is reported and skipped instead of aborting the
    whole run.
    """
    frist_url = 'http://nj.58.com/chuzu'

    headers = [
        {'User-Agent': 'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;'},
        {'User-Agent': 'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1'},
        {'User-Agent': 'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)'},
        {'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'},
    ]

    # NOTE(review): unlike the later requests, this one goes out without a
    # proxy or a User-Agent header — confirm that is intended.
    try:
        resp = requests.get(url=frist_url, timeout=0.5).text
    except requests.exceptions.RequestException as e:
        print(e)
        # Without the first page there is no page count to iterate over.
        return

    pager = etree.HTML(resp).xpath('//div[@class="pager"]/a/span/text()')
    try:
        # The second-to-last pager label carries the last page number.
        max_page = int(pager[-2])
    except (IndexError, ValueError):
        print('could not read the page count from {}'.format(frist_url))
        return

    for page in range(1, max_page + 1):
        next_url = frist_url + "/pn" + str(page)

        # get_proxy() sits inside the try as well: a proxy that validated a
        # moment ago can still fail here with ProxyError.
        try:
            response = requests.get(
                url=next_url,
                proxies=get_proxy(),
                timeout=1,
                headers=random.choice(headers),
            ).text
        except requests.exceptions.RequestException as e:
            print(e)
            continue

        detail_urls = etree.HTML(response).xpath(
            '//ul[@class="listUl"]/li/div[@class="img_list"]/a/@href'
        )

        for detail_url in detail_urls:
            # Random pause of up to three seconds between detail requests.
            time.sleep(random.random() * 3)

            try:
                r = requests.get(
                    url=detail_url,
                    proxies=get_proxy(),
                    timeout=1,
                    headers=random.choice(headers),
                ).text
            except requests.exceptions.RequestException as e:
                print(e)
                # Skip this listing; there is no body to parse.
                continue

            html = etree.HTML(r)

            if "pinpaigongyu" in detail_url:
                parse = _parse_brand_apartment
            else:
                parse = _parse_regular_listing

            try:
                record = parse(html)
            except IndexError:
                # The page did not contain the expected nodes.
                print('skipping {}: expected fields not found'.format(detail_url))
                continue

            print(record['phone'])

# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    crawl()

2078 次点击
所在节点    问与答
1 条回复
U87
2018-06-19 20:24:34 +08:00
还是出现 requests.exceptions.ProxyError,难道是在验证和爬取目标网站之间的这段时间里 ip 失效了?

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/464206

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX