aiohttp is an asynchronous HTTP library built on asyncio; it provides both server-side and client-side functionality.
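This section focuses on the client side. For reference, the server side is exposed through aiohttp's web module; a minimal sketch (the route and handler names here are illustrative, not from the original text) looks like this:

from aiohttp import web

async def handle(request):
    # Respond to GET / with a plain-text body
    return web.Response(text='Hello, aiohttp!')

app = web.Application()
app.add_routes([web.get('/', handle)])

if __name__ == '__main__':
    web.run_app(app)  # serves on http://0.0.0.0:8080 by default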
Basic usage
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text(), response.status
# Event loop fix: on Windows, the default Proactor event loop can raise
# "RuntimeError: Event loop is closed" on exit, so switch to the selector policy
import asyncio
import aiohttp
from aiohttp import ClientTimeout
import sys

if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

async def main():
    # Abort the request if it takes longer than 20 seconds in total
    timeout = ClientTimeout(total=20)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        html, status = await fetch(session, 'https://cuiqingcai.com/')
        print(f'html: {html[:100]}...')
        print(f'status: {status}')

if __name__ == '__main__':
    asyncio.run(main())
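If the total timeout elapses, aiohttp raises asyncio.TimeoutError. A sketch of a wrapper that degrades gracefully instead of propagating the exception (safe_fetch is a name introduced here, not part of aiohttp; it reuses the fetch function defined above):

async def safe_fetch(session, url):
    # Return (None, None) on timeout rather than crashing the caller
    try:
        return await fetch(session, url)
    except asyncio.TimeoutError:
        print(f'request to {url} timed out')
        return None, None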
Running this prints the first 100 characters of the page source along with the status code 200. We have completed a basic HTTP request, i.e. successfully crawled a page asynchronously with aiohttp.
Note that both aiohttp and asyncio must be imported: asynchronous crawling requires starting coroutines, and coroutines only execute when driven by asyncio's event loop.
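To see why the event loop is required, note that calling a coroutine function by itself only creates a coroutine object; nothing runs until the loop drives it. A minimal illustration (hello is a throwaway example, not from the code above):

import asyncio

async def hello():
    return 'hello'

coro = hello()            # just a coroutine object, not yet executed
print(coro)               # <coroutine object hello at 0x...>
print(asyncio.run(coro))  # the event loop actually runs it -> 'hello'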
Example:
import aiohttp
import asyncio
import json
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://spa5.scrape.center/api/book/?limit=18&offset={offset}'
DETAIL_URL = 'https://spa5.scrape.center/api/book/{id}'
PAGE_SIZE = 18
PAGE_NUMBER = 100
CONCURRENCY = 5

# Limit the number of requests in flight at any one time
semaphore = asyncio.Semaphore(CONCURRENCY)
session = None

async def scrape_api(url):
    async with semaphore:
        try:
            logging.info('scraping %s', url)
            async with session.get(url) as response:
                return await response.json()
        except aiohttp.ClientError:
            logging.error('error occurred while scraping %s', url, exc_info=True)

async def scrape_index(page):
    # Each index page holds PAGE_SIZE books, so page N starts at offset PAGE_SIZE * (N - 1)
    url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
    return await scrape_api(url)

async def main():
    global session
    session = aiohttp.ClientSession()
    scrape_index_tasks = [asyncio.ensure_future(scrape_index(page))
                          for page in range(1, PAGE_NUMBER + 1)]
    results = await asyncio.gather(*scrape_index_tasks)
    logging.info('results: %s', json.dumps(results, ensure_ascii=False, indent=2))
    # Close the session to avoid an "Unclosed client session" warning
    await session.close()

if __name__ == '__main__':
    asyncio.run(main())
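DETAIL_URL is declared above, but the snippet stops at the index pages. A sketch of the follow-up detail step, assuming each index response carries a results list whose items have an id field (these field names are assumptions about the API, not verified here):

async def scrape_detail(id):
    # Reuse the rate-limited scrape_api helper for the detail endpoint
    url = DETAIL_URL.format(id=id)
    data = await scrape_api(url)
    logging.info('detail: %s', data)

# Inside main(), after gathering the index results (assumed response shape):
#     ids = [item['id'] for result in results for item in result.get('results', [])]
#     await asyncio.gather(*(scrape_detail(id) for id in ids))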