初学 python 。
近期公司由于业务原因,需要想办法获取到携程与去哪儿的机票信息。 于是我尝试用 python+urllib 对这两个网站上的信息进行抓取。
去哪儿的爬虫代码如下:(初学 python 。代码有很多不合理之处。望海涵。)
# -*- coding:utf-8 -*-
import gzip
import urllib
import urllib.parse
import zlib
from urllib import request
class QunaerSpider:
    """Build Qunar round-trip flight search URLs and fetch the search result.

    The class keeps no per-instance state; the endpoint URLs and the
    User-Agent string are class-level constants.
    """

    __query_flights_base_url = 'http://flight.qunar.com/twelli/longwell?'
    __user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103'
    __referer_base_url = 'http://flight.qunar.com/site/interroundtrip_compare.htm?'
    __referer_end_url = '&from=fi_re_search&lowestPrice=null&isInter=true&favoriteKey=&showTotalPr=null'

    @staticmethod
    def _quote_trip(from_city, from_date, to_city, to_date):
        """Percent-encode the four trip fields (UTF-8) for use in a URL."""
        return tuple(
            urllib.parse.quote(field)
            for field in (from_city, from_date, to_city, to_date)
        )

    @staticmethod
    def _join_query(base_url, pairs):
        """Append already-encoded ``(key, value)`` pairs to ``base_url``.

        ``base_url`` is expected to end with ``?``; pairs are joined with
        ``&`` so the result never contains the malformed ``?&`` sequence.
        """
        return base_url + '&'.join(key + '=' + value for key, value in pairs)

    @staticmethod
    def _decode_body(raw, content_encoding, charset='utf-8'):
        """Decompress ``raw`` according to ``content_encoding`` and decode it.

        Args:
            raw: Response body bytes exactly as read from the socket.
            content_encoding: Value of the ``Content-Encoding`` response
                header, or ``None`` when the header is absent.
            charset: Text encoding of the decompressed payload.
        Returns:
            The response body as ``str``.
        """
        encoding = (content_encoding or '').strip().lower()
        if encoding in ('gzip', 'x-gzip'):
            raw = gzip.decompress(raw)
        elif encoding == 'deflate':
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib
                # header; a negative wbits value tells zlib to expect that.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
        return raw.decode(charset)

    def create_flights_query_url(self, from_city, from_date, to_city, to_date):
        """Return the flight-search request URL for a round trip.

        Args:
            from_city: Departure city.
            from_date: Departure date.
            to_city: Arrival city.
            to_date: Return date.
        Returns:
            The fully assembled query URL as a string.
        """
        from_city_encoded, from_date_encoded, to_city_encoded, to_date_encoded = \
            self._quote_trip(from_city, from_date, to_city, to_date)
        # An ordered list (rather than a dict) pins the parameter order.
        # NOTE(review): 'xd', 'wyf' and '_token' are values captured from a
        # single browser session; whether the server validates them is not
        # known from this code.
        parameters = [
            ('from', 'qunarindex'),
            ('fromCity', from_city_encoded),
            ('fromDate', from_date_encoded),
            ('isInter', 'true'),
            ('prePay', 'true'),
            ('locale', 'zh'),
            ('mergeFlag', '0'),
            ('nextNDays', '0'),
            ('op', '1'),
            ('reset', 'true'),
            ('searchLangs', 'zh'),
            ('searchType', 'RoundTripFlight'),
            ('toCity', to_city_encoded),
            ('toDate', to_date_encoded),
            ('version', 'thunder'),
            ('http://www.travelco.com/searchArrivalAirport', to_city_encoded),
            ('http://www.travelco.com/searchDepartureAirport', from_city_encoded),
            ('http://www.travelco.com/searchDepartureTime', from_date_encoded),
            ('http://www.travelco.com/searchReturnTime', to_date_encoded),
            # "f" followed by a millisecond timestamp.
            ('xd', 'f1469358871776'),
            ('www', 'true'),
            # Already percent-encoded; must not be quoted a second time.
            ('wyf', '0P8HfQ5%2F%2FYA%2FWldSERAyfudSERU0dUd0ERPj%3D%3D%3D%3D%7C1441321882698'),
            ('departureCity', from_city_encoded),
            ('arrivalCity', to_city_encoded),
            ('departureDate', from_date_encoded),
            ('returnDate', to_date_encoded),
            # Token; so far it makes no visible difference whether it is sent.
            ('_token', '6455'),
        ]
        url = self._join_query(QunaerSpider.__query_flights_base_url, parameters)
        print('请求字符串为\n%s' % url)
        return url

    def create_referer_url(self, from_city, from_date, to_city, to_date,
                           from_code='CTU', to_code='TYO'):
        """Return the ``Referer`` URL that accompanies a flight search.

        Args:
            from_city: Departure city.
            from_date: Departure date.
            to_city: Arrival city.
            to_date: Return date.
            from_code: Code sent as ``fromCode``; defaults to the previously
                hard-coded ``'CTU'``.
            to_code: Code sent as ``toCode``; defaults to the previously
                hard-coded ``'TYO'``.
        Returns:
            The fully assembled referer URL as a string.
        """
        from_city_encoded, from_date_encoded, to_city_encoded, to_date_encoded = \
            self._quote_trip(from_city, from_date, to_city, to_date)
        parameters = [
            ('from', 'qunarindex'),
            ('fromCity', from_city_encoded),
            ('fromDate', from_date_encoded),
            ('toCity', to_city_encoded),
            ('toDate', to_date_encoded),
            ('fromCode', urllib.parse.quote(from_code)),
            ('toCode', urllib.parse.quote(to_code)),
        ]
        url = self._join_query(QunaerSpider.__referer_base_url, parameters)
        url += QunaerSpider.__referer_end_url
        print('Referer 为\n%s' % url)
        return url

    def query_flights(self, url, referer_url):
        """Send the flight-search request and return the response text.

        The HTTP status line and the decoded body are also printed.

        Args:
            url: Flight-search URL, e.g. from ``create_flights_query_url``.
            referer_url: Value for the ``Referer`` request header.
        Returns:
            The response body decoded to ``str``.
        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` on failure.
        """
        req = request.Request(url)
        req.add_header('Host', 'flight.qunar.com')
        req.add_header('Accept', '*/*')
        req.add_header('User-Agent', QunaerSpider.__user_agent)
        req.add_header('Connection', 'keep-alive')
        # Advertise only the encodings _decode_body can undo; 'sdch' has no
        # standard-library decoder.
        req.add_header('Accept-Encoding', 'gzip, deflate')
        req.add_header('Content-Type', 'application/json')
        req.add_header('Accept-Language', 'zh-CN,zh;q=0.8')
        req.add_header('Referer', referer_url)
        with request.urlopen(req) as f:
            # Read the payload and the header needed to decompress it.
            data = f.read()
            content_encoding = f.headers.get('Content-Encoding')
            print(f.status, f.reason)
        text = self._decode_body(data, content_encoding)
        print('Data:', text)
        return text
# Sample run: round trip between 成都 and 东京. The same four arguments
# feed both URL builders, so they are kept in a single tuple.
_trip = ('成都', '2016-08-20', '东京', '2016-09-11')
qunaerSpider = QunaerSpider()
referer_url = qunaerSpider.create_referer_url(*_trip)
url = qunaerSpider.create_flights_query_url(*_trip)
qunaerSpider.query_flights(url, referer_url)
去哪儿网遇到的问题
爬虫返回的信息为
200 OK Data: {isLimit:true}
使用同样的 url ,通过浏览器却能正常访问。想知道原因以及修正办法。
携程网遇到的问题
携程网的爬虫代码还没开始写。不过大概分析了一下,携程网使用的是 POST 请求,并且每次请求时都会验证“SearchKey”与“TransNo”。请问有经验的同学是否知道这两个参数是在哪里、通过什么方式获取到的?下面附上携程的请求参数列表。
{
"FlightWay": "D",
"SegmentList": [{
"DCityCode": "CTU",
"ACityCode": "TYO",
"DCity": "Chengdu|成都(CTU)|28|CTU|480",
"ACity": "Tokyo|东京(TYO)|228|TYO|540",
"DepartDate": "2016-8-1"
}, {
"DCityCode": "TYO",
"ACityCode": "CTU",
"DCity": "Tokyo|东京(TYO)|228|TYO|480",
"ACity": "Chengdu|成都(CTU)|28|CTU|540",
"DepartDate": "2016-9-10"
}],
"TransferCityID": 0,
"Quantity": 1,
// 每次请求发生变化,携程会验证此 key
"TransNo": "5516072421000033701",
"SearchRandomKey": "",
"IsAsync": 1,
"RecommendedFlightSwitch": 1,
// 每次请求发生变化。携程会验证此 key
"SearchKey": "BEBFB6F8C0C56B8561A9B435AE822DF4D499B75C2FFA74D481318741A7F9537EFB59C5327342DE0D1A11D1E626A03C6C843FE6E311D4819F",
"MultiPriceUnitSwitch": 1,
"TransferCitySwitch": false,
"EngineScoreABTest": "B",
"AdjacentDateKey": "",
"SearchStrategySwitch": 1,
"MaxSearchCount": 3,
"TicketRemarkSwitch": 1,
"RowNum": "1500",
"TicketRemarkChannels": ["GDS-WS", "ZY-WS"],
"AddSearchLogOneByOne": true,
"TFAirlineQTE": "AA",
"IsWifiPackage": 0
}
谢谢各位。
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.