提取标签内容遇到的 BUG

2017-08-22 21:37:40 +08:00
 kerberos
#-*-coding:utf-8-*-

import requests
import re
import time
from urllib.parse import quote
from save import get_Mysql

from lxml import etree
import json


class mySpider(object):
def __init__(self,dbname,mykey,mycity):
self.dbname = dbname
self.key = mykey
self.city = mycity
self.start_url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&sm=0&p=1".format(quote(self.key),quote(self.city))
self.headers = {
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding":"gzip, deflate, sdch",
"Accept-Language":"zh-CN,zh;q=0.8,mt;q=0.6",
"Cache-Control":"max-age=0",
"Connection":"keep-alive",
"Host":"sou.zhaopin.com",
"Referer":"http://www.zhaopin.com/",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
}
# self.mysql = get_Mysql(self.dbname,self.key,self.city)
# self.mysql.create_table()

#递归获取页面数据
def get_one_html(self,url):
data = {}
html = requests.get(url,headers=self.headers)
infos = etree.HTML(html.text)
selectors = infos.xpath('//*[@id="newlist_list_content_table"]/table')
for i in range(len(selectors)):
selector = selectors[i]
# 职位月薪
data["job_gz"] = selector.xpath('//tr[1]/td[4]/text()')
print(data["job_gz"])

# 招聘链接
data["job_link"] = re.findall('http://jobs.zhaopin.com/\d+.htm',html.text)

# 职位名称
data['job_name'] = re.findall('http://jobs.zhaopin.com/\d+.htm" target="_blank">(.*?)</a>',html.text,re.S)


# 公司名称
data["gsmc"] = [gsmc for gsmc in selector.xpath('//td[@class="gsmc"]/a[1]/text()')]

# 公司链接
data["gs_link"] = selector.xpath('//td[@class="gsmc"]/a[1]/@href')


# 工作地点
data["job_dd"] = selector.xpath('//td[@class="gzdd"]/text()')

# 学历 /经验要求
data["xlyq"] = selector.xpath('//li[@class="newlist_deatil_two"]/span[4]/text()')

# 公司规模
data["gsgm"] = selector.xpath('//li[@class="newlist_deatil_two"]/span[3]/text()')

# 公司性质
data["gsxz"] = selector.xpath('//li[@class="newlist_deatil_two"]/span[2]/text()')



def main(self):
self.get_one_html(self.start_url)
# try:
# self.get_one_html(self.start_url)
# except Exception as e:
# print(e)
# finally:
# self.mysql.close_table()

if __name__ == '__main__':
start = time.time()
s = mySpider('51job','北京','java')
s.main()
end = time.time()
print("耗时:{:.2f}秒".format(float(time.time()-start)))

----------------------------------------------分割线------------------------------------------------------
以上是源码,bug 出现在
selectors = infos.xpath('//*[@id="newlist_list_content_table"]/table')
for i in range(len(selectors)):
selector = selectors[i]
# 职位月薪
data["job_gz"] = selector.xpath('//tr[1]/td[4]/text()')
print(data["job_gz"])

我享得到是循环获取每个标签下的文本,结果一次循环就提取了所有的,得到的数据为
{'job_gz': ['10001-15000', '10000-20000', '15000-30000', '10000-18000', '面议', '10001-15000', '8001-10000', '6001-8000', '12000-20000', '8001-10000', '10001-15000', '10001-15000', '4001-6000', '15001-20000', '6001-8000', '面议', '6001-8000', '6001-8000', '15001-20000', '8001-10000', '10001-15000', '面议', '6001-8000', '8001-10000', '6001-8000', '15000-25000', '15000-20000', '12000-20000', '15001-20000', '6001-8000', '6001-8000', '20001-30000', '6001-8000', '10001-15000', '6001-8000', '6001-8000', '6001-8000', '8001-10000', '6001-8000', '15000-25000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '2500-5000', '6001-8000']}
key 对应的 value 是 list,我想得到的是 key-value,一一对应的,不清楚哪里的逻辑出错了。请帮忙 debug
1366 次点击
所在节点    Python
0 条回复

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/384995

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX