由于单线程爬取太慢,所以刚又写了个多进程版本。下面我来为大家介绍一下草民网好莱坞大片的多进程爬取方法,希望有你要的答案,我们一起来看看吧!

草民网多进程爬取好莱坞大片(草民网多进程爬取好莱坞大片)

草民网多进程爬取好莱坞大片

由于单线程爬取太慢,所以刚又写了个多进程版本,代码如下:

from urllib import request

import urllib

from time import sleep

import socket

import random

from multiprocessing import Process

class Video():
    """Download numbered HLS ``.ts`` segments of one video into ``./HW/``.

    Typical use is ``Video().start(i)`` for each segment index ``i``; every
    call builds the segment URL, fetches it and stores it as ``./HW/<i>.ts``.
    """

    # NOTE(review): the published listing had the literal "...FF9QaL5935d.ts"
    # followed by ``% i``, which cannot format anything (TypeError). The bare
    # "d" looks like the remainder of a zero-padded placeholder that was lost
    # when the article was extracted, so "%03d" is reconstructed here.
    # Confirm the real segment naming against the site's .m3u8 playlist.
    URL_TEMPLATE = (
        "https://zy.baidu-360-yyy-kubo.com/20180705/mBlJPrWZ/800kb/hls/"
        "FF9QaL5935%03d.ts"
    )

    def __init__(self):
        # Despite the plural name, this holds only the User-Agent string that
        # download_ts() sends with every request.
        self.headers = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3704.400 QQBrowser/10.4.3615.400"
        # URL of the segment currently being processed; filled by set_url().
        self.url = ""

    def set_url(self, i):
        """Point ``self.url`` at segment number ``i``.

        Indices below 1000 are zero-padded to three digits; larger indices are
        written out in full. The original had two identical branches for these
        cases, which a single ``%03d`` covers.
        """
        self.url = self.URL_TEMPLATE % i

    # Fetch the .ts segment and save it to disk.
    def download_ts(self, i):
        """Fetch ``self.url`` and write the payload to ``./HW/<i>.ts``.

        No explicit timeout is passed to ``urlopen``, so the process-wide
        socket default timeout (if any) applies.

        Raises:
            urllib.error.URLError: the request failed (includes HTTP errors).
            socket.timeout: the server stopped responding while reading.
            OSError: the output file could not be written.
        """
        import os  # local import: only needed to create the output directory

        rq = request.Request(self.url)
        rq.add_header('User-Agent', self.headers)
        # The context manager closes the response even if read() raises.
        with request.urlopen(rq) as response:
            res = response.read()
        # Create the target directory on demand; exist_ok keeps this safe when
        # several worker processes reach this line at the same time.
        os.makedirs('./HW/', exist_ok=True)
        # Output file path.
        with open('./HW/' + str(i) + ".ts", "wb") as f:
            f.write(res)

    def start(self, i, retries=1):
        """Download segment ``i``, retrying on network errors.

        Args:
            i: Segment index; also used as the output file name.
            retries: Extra attempts after the first failure. The default of 1
                matches the single retry the original code performed.

        Returns:
            True if the segment was saved, False if every attempt failed.
        """
        self.set_url(i)
        for _ in range(retries + 1):
            try:
                self.download_ts(i)
            except urllib.error.URLError as e:
                print(e.reason)
            except socket.timeout as e2:
                # socket.timeout carries no ``reason`` attribute, so print the
                # exception itself.
                print(e2)
            else:
                print('第' + str(i) + ".ts" + '已下载')
                sleep(1)  # short pause between successful downloads
                return True
        # All attempts failed: report it instead of letting the worker die
        # with an unhandled traceback.
        print('第' + str(i) + ".ts" + '下载失败')
        return False

if __name__ == '__main__':
    # Driver: download segments 0..2000 in batches of five worker processes,
    # waiting for each batch to finish before launching the next one.
    video = Video()

    # Socket-level default timeout, chosen at random between 5 and 10 seconds
    # (the original comment said 20 seconds, which did not match the code).
    # NOTE(review): under the "spawn" start method child processes do not run
    # this guarded block, so they would not see this default - confirm on the
    # target platform.
    socket.setdefaulttimeout(random.randint(5, 10))

    last_index = 2000   # highest segment index to fetch (inclusive)
    batch_size = 5      # number of processes running concurrently

    for batch_start in range(0, last_index + 1, batch_size):
        # Clamp the final batch so no index beyond last_index is requested;
        # the original range(I, I + 5) overshot to 2004 on the last pass.
        batch_end = min(batch_start + batch_size, last_index + 1)
        p_l = [
            Process(target=video.start, args=(i,))
            for i in range(batch_start, batch_end)
        ]
        for p in p_l:
            p.start()
        for p in p_l:
            p.join()

,