V2EX = way to explore
V2EX 是一个关于分享和探索的地方
现在注册
已注册用户请  登录
V2EX  ›  wangfeng3769  ›  全部回复第 42 页 / 共 53 页
回复总数  1060
1 ... 38  39  40  41  42  43  44  45  46  47 ... 53  
2014 年 8 月 9 日
回复了 wangfeng3769 创建的主题 Python python 下的多线程选哪个比较好?
@skybr 希望试一试,确实很爽。但是记住千万别默认安装。
2014 年 8 月 9 日
回复了 wangfeng3769 创建的主题 Python python 下的多线程选哪个比较好?
http://architects.dzone.com/articles/install-stackless-python 完美安装不影响原来的,但是stackless之后不能import django 等一些第三方库。
@skybr
2014 年 8 月 9 日
回复了 wangfeng3769 创建的主题 Python python 下的多线程选哪个比较好?
stackless 完美安装。
2014 年 8 月 6 日
回复了 yuelang85 创建的主题 酷工作 超级英雄 制作公司,寻找 python 程序员
remote 可以吗!
2014 年 8 月 6 日
回复了 ksex 创建的主题 Linux Linux 基金会推出免费课程《Linux 导论》
写得不好,跟鸟哥的私房菜差多了。
2014 年 8 月 5 日
回复了 wangfeng3769 创建的主题 问与答 python 的 multiprocessing
退出的时候有点小问题,不知道哪儿出了问题。
2014 年 8 月 5 日
回复了 wangfeng3769 创建的主题 问与答 python 的 multiprocessing
好好写代码,看代码
2014 年 8 月 5 日
回复了 wangfeng3769 创建的主题 问与答 python 的 multiprocessing
有点流氓但是呢 现在就这样了。
2014 年 8 月 5 日
回复了 wangfeng3769 创建的主题 问与答 python 的 multiprocessing
#coding:utf-8
import re
import os
import requests as R
from BeautifulSoup import BeautifulSoup as BS
import multiprocessing
import urlparse
import time
# Spoofed mobile WeChat User-Agent sent with every page request.
opt = "Mozilla/5.0 (Linux; U; Android 4.1.2; zh-cn; GT-I9300 Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30 MicroMessenger/5.2.380"
headers = {'User-Agent':opt}
# Pipe between the two processes: work() sends link batches on `a`,
# download() receives them on `b`.
a,b = multiprocessing.Pipe()
# Only URLs matching this domain are crawled and saved to disk.
domain_url = "66365.m.weimob.com"
G_queue_url = []  # crawl frontier: every URL discovered so far
G_spidered_url = []  # URLs already visited by work()

def is_existed(file_real_path):
    # Pick a non-clobbering save path for `file_real_path`: try
    # "<path>.htm" first, then "<path>_1.htm", "<path>_2.htm", ...
    # and return the first candidate that is not an existing file.
    candidate = file_real_path + '.htm'
    suffix = 1
    while os.path.isfile(candidate):
        candidate = file_real_path + '_%s.htm' % str(suffix)
        suffix += 1
    return candidate


def get_web_page(url):
    # Fetch `url` with the spoofed mobile User-Agent (module-level
    # `headers`) and return the decoded body text, or None when the
    # request fails or the body is empty.
    try:
        r = R.get(url, headers=headers)
        html = r.text
    except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
        return None
    if html:
        return html
    return None

def btree(O):
    # Parse raw HTML into a BeautifulSoup tree (forced UTF-8 decoding);
    # returns None for empty/None input.
    if not O:
        return None
    return BS(O, fromEncoding="utf-8")

def download():
url = "http://66365.m.weimob.com/weisite/home?pid=66365&bid=135666&wechatid=oAdrCtzBdLhgpyIOYtBNELkWXJ68&wxref=mp.weixin.qq.com"
print 'download'
checked_list = []

while True:
print 'I am busy'


recv_data = b.recv()
# recv_data = [url]
# print recv_data
if type(recv_data)!=type([]):
if recv_data ==0:
break

for url in recv_data:
print url
if url in checked_list:
# checked_list.append(url)
continue
else:
checked_list.append(url)

if re.search(domain_url,url):
url_list = urlparse.urlparse(url)
domain_folder = url_list[1]
file_path = url_list.path
real_path_r = os.path.sep.join([domain_folder,file_path])
real_path_l = re.split(r'/|\\',real_path_r)
# print real_path_l
if len(real_path_l)==2:
if not real_path_l[-1]:
continue
real_path_f = os.path.sep.join(real_path_l[0:-1])
real_path_r = is_existed(real_path_r)
try:
if not os.path.exists(real_path_f) :
os.makedirs(real_path_f)
try:
f = open(real_path_r,'w')
except :
open(real_path_r).close()
f = open(real_path_r,'w')
else:
try:
f = open(real_path_r,'w')
except :
open(real_path_r).close()
f = open(real_path_r,'w')
r = R.get(url)
content = unicode(r.text).encode(r.encoding,'ignore')
if not content:
continue
f.write(content)
f.close()
except:
pass
else:
pass

def get_links(html):
    # Extract every relative <a href> from the page and absolutize it onto
    # the crawl domain ("http://" + domain_url + href). Links that already
    # mention the domain or any "http" scheme are skipped; returns [] when
    # the page cannot be parsed or has no anchors.
    soup = btree(html)
    if not soup:
        return []
    anchors = soup.findAll('a')
    if not anchors:
        return []
    collected = []
    for anchor in anchors:
        try:
            href = anchor.get('href')
        except Exception:
            continue
        if not href:
            continue
        if re.search(domain_url, href) or re.search('http', href):
            continue
        collected.append("http://" + domain_url + href)
    return collected

def work(url):
    # Producer process: recursively crawl starting at `url`, appending
    # discoveries to the global frontier G_queue_url and sending each batch
    # of newly found links through pipe end `a` for download() to save.
    # NOTE(review): every recursion level sends the 0 sentinel on return,
    # so the consumer may see 0 before the crawl is truly finished.
    global G_spidered_url
    global G_queue_url
    G_spidered_url.append(url)
    page = get_web_page(url)
    links = get_links(page)
    fresh = []
    if G_queue_url and links:
        for candidate in links:
            if candidate not in G_queue_url:
                fresh.append(candidate)
                G_queue_url.append(candidate)
        a.send(fresh)
    elif links and not G_queue_url:
        G_queue_url = links
        a.send(links)

    for pending in G_queue_url:
        if pending in G_spidered_url:
            continue
        G_spidered_url.append(pending)
        work(pending)
    a.send(0)

def main(url):
    # Launch the crawler (work) and the page saver (download) as two
    # separate processes and wait for both to finish.
    # freeze_support() is a no-op unless the script is frozen on Windows.
    multiprocessing.freeze_support()
    # fix: removed an unused multiprocessing.Lock() that was created here
    # but never shared with either process.
    w = multiprocessing.Process(target=work, args=(url, ))
    nw = multiprocessing.Process(target=download, args=())
    w.start()
    nw.start()
    w.join()
    nw.join()


if __name__ == '__main__':
url= "http://66365.m.weimob.com/weisite/home?pid=66365&bid=135666&wechatid=oAdrCtzBdLhgpyIOYtBNELkWXJ68&wxref=mp.weixin.qq.com"

import sys
try:
url = sys.argv[1]
except:
print "You have to input a complete URL"
# main(url)
multiprocessing.freeze_support()
lock = multiprocessing.Lock()
w = multiprocessing.Process(target=work, args=(url, ))
nw = multiprocessing.Process(target=download, args=())
w.start()
nw.start()
w.join()
nw.join()


想说一下 在windows下无法运行download ,看一下怎么回事,专门扒人家网站的爬虫。希望copy下来试试。祝大家好运。
2014 年 8 月 4 日
回复了 wangfeng3769 创建的主题 问与答 python 的 multiprocessing
求大仙们说一声。
2014 年 8 月 4 日
回复了 wangfeng3769 创建的主题 问与答 python 的 multiprocessing
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "D:\Python27\lib\multiprocessing\forking.py", line 380, in main
You have to input a complete URL
prepare(preparation_data)
Traceback (most recent call last):
File "D:\Python27\lib\multiprocessing\forking.py", line 495, in prepare
File "<string>", line 1, in <module>
'__parents_main__', file, path_name, etc
File "D:\Python27\lib\multiprocessing\forking.py", line 380, in main
File "D:\sworkspace\weixinspider.py", line 160, in <module>
prepare(preparation_data)
File "D:\Python27\lib\multiprocessing\forking.py", line 495, in prepare
main(url)
File "D:\sworkspace\weixinspider.py", line 147, in main
'__parents_main__', file, path_name, etc
File "D:\sworkspace\weixinspider.py", line 160, in <module>
w.start()
File "D:\Python27\lib\multiprocessing\process.py", line 130, in start
main(url)
File "D:\sworkspace\weixinspider.py", line 147, in main
self._popen = Popen(self)
File "D:\Python27\lib\multiprocessing\forking.py", line 258, in __init__
w.start()
File "D:\Python27\lib\multiprocessing\process.py", line 130, in start
cmd = get_command_line() + [rhandle]
self._popen = Popen(self)
File "D:\Python27\lib\multiprocessing\forking.py", line 358, in get_command_li
ne
File "D:\Python27\lib\multiprocessing\forking.py", line 258, in __init__
is not going to be frozen to produce a Windows executable.''')
RuntimeError: cmd = get_command_line() + [rhandle]

File "D:\Python27\lib\multiprocessing\forking.py", line 358, in get_command_li
ne
Attempt to start a new process before the current process
has finished its bootstrapping phase.

This probably means that you are on Windows and you have
forgotten to use the proper idiom in the main module:

if __name__ == '__main__':
freeze_support()
...

The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce a Windows executable.
is not going to be frozen to produce a Windows executable.''')
RuntimeError:
Attempt to start a new process before the current process
has finished its bootstrapping phase.

This probably means that you are on Windows and you have
forgotten to use the proper idiom in the main module:

if __name__ == '__main__':
freeze_support()
...

The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce a Windows executable.
2014 年 8 月 4 日
回复了 wangfeng3769 创建的主题 问与答 python 的 multiprocessing
multiprocessing.freeze_support()
# lock = multiprocessing.Lock()
w = multiprocessing.Process(target=work, args=(url, ))
nw = multiprocessing.Process(target=download, args=())
w.start()
nw.start()
w.join()
nw.join()
有坚持,有毅力。
2014 年 8 月 1 日
回复了 niuer 创建的主题 分享发现 喜获融资数千万美元 七牛云存储来 V2EX 送福利
2014 年 8 月 1 日
回复了 wangfeng3769 创建的主题 问与答 局域网封了 github,怎样才能正常使用 github
各位都很牛逼呀。
2014 年 7 月 31 日
回复了 wangfeng3769 创建的主题 问与答 局域网封了 github,怎样才能正常使用 github
Ubuntu有事 xp 没事
2014 年 7 月 31 日
回复了 wangfeng3769 创建的主题 问与答 局域网封了 github,怎样才能正常使用 github
局域网是不能穿越的吗?
@tankb52 server版的用过,命令行也不错,但是desktop版的就不好说了。
1 ... 38  39  40  41  42  43  44  45  46  47 ... 53  
关于   ·   帮助文档   ·   自助推广系统   ·   博客   ·   API   ·   FAQ   ·   Solana   ·   3044 人在线   最高记录 6679   ·     Select Language
创意工作者们的社区
World is powered by solitude
VERSION: 3.9.8.5 · 31ms · UTC 13:48 · PVG 21:48 · LAX 06:48 · JFK 09:48
♥ Do have faith in what you're doing.