#coding:utf-8
import re
import os
import requests as R
from BeautifulSoup import BeautifulSoup as BS
import multiprocessing
import urlparse
import time
opt = "Mozilla/5.0 (Linux; U; Android 4.1.2; zh-cn; GT-I9300 Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30 MicroMessenger/5.2.380"
headers = {'User-Agent':opt}
# one Pipe shared by the two worker processes: work() writes to `a`,
# download() reads from `b`
a, b = multiprocessing.Pipe()
domain_url = "66365.m.weimob.com"
G_queue_url = []       # URLs discovered but not yet crawled
G_spidered_url = []    # URLs already crawled
def is_existed(file_real_path):
    # Return a file name that does not collide with an existing file:
    # try "<path>.htm" first, then "<path>_1.htm", "<path>_2.htm", ...
    i = 1
    while 1:
        if i == 1:
            file_real_path_tem = file_real_path + '%s.htm' % ""
        if os.path.isfile(file_real_path_tem):
            file_real_path_tem = file_real_path + '_%s.htm' % str(i)
        else:
            return file_real_path_tem
        i = i + 1
def get_web_page(url):
    # Fetch a page and return its text, or None on any failure.
    try:
        r = R.get(url, headers=headers)
        html = r.text
    except:
        return None
    if html:
        return html
    else:
        return None
def btree(O):
    # Build a BeautifulSoup tree from an HTML string, or return None.
    if O:
        return BS(O, fromEncoding="utf-8")
    else:
        return None
def download():
    # Consumer process: receive lists of URLs from work() over the pipe and
    # save each in-site page to disk, mirroring the URL path as folders.
    url = "http://66365.m.weimob.com/weisite/home?pid=66365&bid=135666&wechatid=oAdrCtzBdLhgpyIOYtBNELkWXJ68&wxref=mp.weixin.qq.com"
    print 'download'
    checked_list = []
    while True:
        print 'I am busy'
        recv_data = b.recv()
        # recv_data = [url]
        # print recv_data
        if not isinstance(recv_data, list):
            if recv_data == 0:
                # 0 is the stop signal sent by work() when crawling is done
                break
            continue
        for url in recv_data:
            print url
            if url in checked_list:
                continue
            else:
                checked_list.append(url)
            if re.search(domain_url, url):
                url_list = urlparse.urlparse(url)
                domain_folder = url_list[1]
                file_path = url_list.path
                real_path_r = os.path.sep.join([domain_folder, file_path])
                real_path_l = re.split(r'/|\\', real_path_r)
                # print real_path_l
                if len(real_path_l) == 2:
                    if not real_path_l[-1]:
                        # bare domain with no path: nothing to save
                        continue
                real_path_f = os.path.sep.join(real_path_l[0:-1])
                real_path_r = is_existed(real_path_r)
                try:
                    if not os.path.exists(real_path_f):
                        os.makedirs(real_path_f)
                    f = open(real_path_r, 'w')
                    r = R.get(url, headers=headers)
                    # r.encoding can be None, so fall back to utf-8
                    content = r.text.encode(r.encoding or 'utf-8', 'ignore')
                    if not content:
                        f.close()
                        continue
                    f.write(content)
                    f.close()
                except:
                    pass
            else:
                pass
def get_links(html):
    # Pull the href of every <a> tag; only relative links are kept here and
    # turned into absolute in-site URLs.
    soup = btree(html)
    # print soup
    if not soup:
        return []
    a_links = soup.findAll('a')
    if not a_links:
        return []
    link_list = []
    for link in a_links:
        # print link
        try:
            link = link.get('href')
            if not link:
                continue
        except:
            # print link
            continue
        if not re.search(domain_url, link) and not re.search('http', link):
            link_list.append("http://" + domain_url + link)
    return link_list
def work(url):
    # Producer process: crawl recursively, record visited URLs, and push every
    # newly discovered link to download() through the pipe.
    global G_spidered_url
    global G_queue_url
    # print G_spidered_url, G_queue_url
    G_spidered_url.append(url)
    html = get_web_page(url)
    all_links = get_links(html)
    send_list = []
    if G_queue_url and all_links:
        for slink in all_links:
            if slink not in G_queue_url:
                send_list.append(slink)
                G_queue_url.append(slink)
        a.send(send_list)
    elif not G_queue_url and all_links:
        G_queue_url = all_links
        a.send(all_links)
    for url in G_queue_url:
        if url in G_spidered_url:
            continue
        else:
            G_spidered_url.append(url)
            work(url)
    a.send(0)   # stop signal for download()
def main(url):
    multiprocessing.freeze_support()
    lock = multiprocessing.Lock()
    w = multiprocessing.Process(target=work, args=(url, ))
    nw = multiprocessing.Process(target=download, args=())
    w.start()
    nw.start()
    w.join()
    nw.join()
if __name__ == '__main__':
    url = "http://66365.m.weimob.com/weisite/home?pid=66365&bid=135666&wechatid=oAdrCtzBdLhgpyIOYtBNELkWXJ68&wxref=mp.weixin.qq.com"
    import sys
    try:
        url = sys.argv[1]
    except IndexError:
        print "You have to input a complete URL"
    # main(url)
    multiprocessing.freeze_support()
    lock = multiprocessing.Lock()
    w = multiprocessing.Process(target=work, args=(url, ))
    nw = multiprocessing.Process(target=download, args=())
    w.start()
    nw.start()
    w.join()
    nw.join()
One thing I want to mention: download() does not run under Windows, so take a look at what is going on there. This is a crawler written specifically to scrape someone else's site. Feel free to copy it and give it a try. Good luck, everyone.
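A guess at the Windows problem: `a, b = multiprocessing.Pipe()` is created at module level, and on Windows child processes are started by re-importing the module, so each process builds its own brand-new Pipe and the ends that work() and download() hold are never actually connected (on Linux, fork() inherits the already-created pipe, which is why it works there). Below is a minimal, self-contained sketch of the usual fix, assuming that diagnosis is right: create the Pipe inside the __main__ block and pass the connection ends to the processes as arguments. The sender/receiver functions are placeholders standing in for work() and download(), not part of the original script.

# coding: utf-8
import multiprocessing

def sender(conn):
    # stands in for work(): push a list of URLs, then the 0 stop signal
    conn.send(["http://66365.m.weimob.com/weisite/home"])
    conn.send(0)

def receiver(conn):
    # stands in for download(): read from the pipe until the 0 stop signal
    while True:
        data = conn.recv()
        if data == 0:
            break
        print data

if __name__ == '__main__':
    # Pipe is created here and handed to each child explicitly, so the same
    # two connection ends exist in both processes even under Windows spawn.
    parent_conn, child_conn = multiprocessing.Pipe()
    w = multiprocessing.Process(target=sender, args=(parent_conn,))
    nw = multiprocessing.Process(target=receiver, args=(child_conn,))
    w.start()
    nw.start()
    w.join()
    nw.join()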