def download_picture(url):
    """Download the image at ``url`` into the ``images/beauty`` directory.

    The file is named after the part of ``url`` that follows its last ``/``.
    Nothing is written unless the server answers with HTTP status 200.

    :param url: URL of the image to fetch
    """
    # rfind() returns -1 when there is no '/', so the slice then starts at 0
    # and the whole url is used as the file name.
    filename = url[url.rfind('/') + 1:]
    resp = requests.get(url)
    if resp.status_code == 200:
        with open(f'images/beauty/{filename}', 'wb') as file:
            file.write(resp.content)
def main():
    """Fetch three listing pages and download every picture they reference.

    The listing endpoint is requested with ``sn`` set to 0, 30 and 60; each
    successful (HTTP 200) JSON response is expected to carry a ``list`` of
    dicts with a ``qhimg_url`` key. Pictures are downloaded one after another.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and creating the directory afterwards.
    os.makedirs('images/beauty', exist_ok=True)
    for page in range(3):
        resp = requests.get(f'https://image.so.com/zjl?ch=beauty&sn={page * 30}')
        if resp.status_code == 200:
            pic_dict_list = resp.json()['list']
            for pic_dict in pic_dict_list:
                download_picture(pic_dict['qhimg_url'])
# Start the crawler only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()
在 macOS 或 Linux 系统上,我们可以使用 time 命令来了解上面代码的执行时间以及 CPU 的利用率,如下所示。
time python3 example04.py
下面是单线程爬虫代码在我的电脑上执行的结果。
python3 example04.py 2.36s user 0.39s system 12% cpu 21.578 total
这里我们只需要关注两个数据:代码的总耗时为 21.578 秒,CPU 利用率为 12%。
多线程版本
我们使用之前讲到过的线程池技术,将上面的代码修改为多线程版本。
""" example05.py - 多线程版本爬虫 """ import os from concurrent.futures import ThreadPoolExecutor
import requests
def download_picture(url):
    """Download the image at ``url`` into the ``images/beauty`` directory.

    The file is named after the part of ``url`` that follows its last ``/``.
    Nothing is written unless the server answers with HTTP status 200.

    :param url: URL of the image to fetch
    """
    # rfind() returns -1 when there is no '/', so the slice then starts at 0
    # and the whole url is used as the file name.
    filename = url[url.rfind('/') + 1:]
    resp = requests.get(url)
    if resp.status_code == 200:
        with open(f'images/beauty/{filename}', 'wb') as file:
            file.write(resp.content)
def main():
    """Fetch three listing pages and download their pictures on a thread pool.

    The listing endpoint is requested with ``sn`` set to 0, 30 and 60; each
    successful (HTTP 200) JSON response is expected to carry a ``list`` of
    dicts with a ``qhimg_url`` key. Every picture URL is handed to a pool of
    16 worker threads, while the listing requests themselves stay sequential.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and creating the directory afterwards.
    os.makedirs('images/beauty', exist_ok=True)
    # Leaving the with-block waits for all submitted downloads to finish.
    with ThreadPoolExecutor(max_workers=16) as pool:
        for page in range(3):
            resp = requests.get(f'https://image.so.com/zjl?ch=beauty&sn={page * 30}')
            if resp.status_code == 200:
                pic_dict_list = resp.json()['list']
                for pic_dict in pic_dict_list:
                    # NOTE: the returned Future is discarded, so an exception
                    # raised inside download_picture is never reported here.
                    pool.submit(download_picture, pic_dict['qhimg_url'])
# Start the crawler only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()
执行如下所示的命令。
time python3 example05.py
代码的执行结果如下所示:
python3 example05.py 2.65s user 0.40s system 95% cpu 3.193 total
async def download_picture(session, url):
    """Download the image at ``url`` into ``images/beauty`` without blocking.

    The file is named after the part of ``url`` that follows its last ``/``.
    Nothing is written unless the server answers with HTTP status 200.

    :param session: client session used to issue the GET request
        (``fetch_json`` passes an ``aiohttp.ClientSession``)
    :param url: URL of the image to fetch
    """
    # rfind() returns -1 when there is no '/', so the slice then starts at 0
    # and the whole url is used as the file name.
    filename = url[url.rfind('/') + 1:]
    # NOTE(review): ssl=False turns off TLS certificate verification for this
    # request - acceptable for a demo, confirm before reusing elsewhere.
    async with session.get(url, ssl=False) as resp:
        if resp.status == 200:
            data = await resp.read()
            async with aiofile.async_open(f'images/beauty/{filename}', 'wb') as file:
                await file.write(data)
async def fetch_json():
    """Fetch three listing pages and download every picture they reference.

    One ``aiohttp.ClientSession`` is shared by all requests. The listing
    endpoint is requested with ``sn`` set to 0, 30 and 60; each successful
    (HTTP 200) response body is parsed as JSON and is expected to carry a
    ``list`` of dicts with a ``qhimg_url`` key.
    """
    async with aiohttp.ClientSession() as session:
        for page in range(3):
            # NOTE(review): ssl=False turns off TLS certificate verification.
            async with session.get(
                url=f'https://image.so.com/zjl?ch=beauty&sn={page * 30}',
                ssl=False
            ) as resp:
                if resp.status == 200:
                    json_str = await resp.text()
                    result = json.loads(json_str)
                    for pic_dict in result['list']:
                        # Each download is awaited before the next one starts,
                        # so pictures are fetched one at a time.
                        await download_picture(session, pic_dict['qhimg_url'])