In an earlier post we covered parsing pages with XPath. This time, building on that code, we save the scraped data directly into MongoDB. The reference code is as follows:

from lxml import etree
import requests
import re
import pymongo

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0'
}

# Fetch the page content for a given URL
def getOnePage(url):
    try:
        resp = requests.get(url, headers=headers)
        # The server responded normally
        if resp.status_code == 200:
            return resp.text
        return None
    except Exception:
        return None

# Parse the HTML with XPath, then use regular expressions to extract the needed strings
def parseOnePage(html):
    # Get a connection object (defaults to localhost:27017)
    client = pymongo.MongoClient()
    # Get the database object; 'db-novels' is created if it does not exist
    db = client['db-novels']
    # Get the collection to operate on; it is also created on first use
    collection = db['collection-book']
    selector_html = etree.HTML(html)
    # Select the div node of every book on the page
    items = selector_html.xpath('//div[@class="doulist-item"]')
    # Iterate over the book divs
    for item in items:
        # Cover image URL of the book
        pic = item.xpath('.//div[@class="post"]/a/img/@src')[0]
        bname = item.xpath('.//div[@class="title"]/a/text()')[0]
        match = re.search(r"\w+", bname)
        bname = match.group() if match else ''
        rate = item.xpath('.//div[@class="rating"]/span[last()-1]/text()')[0]
        author = item.xpath('.//div[@class="abstract"]/text()')[0]
        match = re.search(r"(?<=作者:\s)(.*)", author, re.M)
        author = match.group() if match else ''
        company = item.xpath('.//div[@class="abstract"]/text()')[1]
        match = re.search(r"(?<=出版社:\s)(.*)", company)
        company = match.group() if match else ''
        date = item.xpath('.//div[@class="abstract"]/text()')[2]
        match = re.search(r"\d{4}(-\d{1,2})?", date)
        date = match.group() if match else ''
        print(bname + '\t' + author + '\t' + company + '\t' + date + '\t' + rate + '\t' + pic)
        # Collect the fields into a dict (the original built a list of pairs,
        # which shadowed the built-in list; a dict literal is equivalent and cleaner)
        row = {'bname': bname, 'author': author, 'company': company,
               'b-date': date, 'rate': rate, 'pic-url': pic}
        print(row)
        # Insert the document into the collection
        collection.insert_one(row)

# Crawl one page of the list and store its books in MongoDB
def getTop100(url):
    # Fetch the page content
    html = getOnePage(url)
    # Extract the book data from the page and save it to MongoDB
    if html is not None:
        parseOnePage(html)

# The four paginated URLs of the list
urls = ['https://www.douban.com/doulist/45004834/?start={}'.format(str(i)) for i in range(0, 100, 25)]
for url in urls:
    print(url)
    getTop100(url)
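Once the script has run, it is worth confirming that the documents actually landed in MongoDB. Below is a minimal verification sketch (not part of the original script), assuming the same local MongoDB instance and the db-novels / collection-book names used above:

import pymongo

# Connect to the same local MongoDB instance the crawler wrote to
client = pymongo.MongoClient()
collection = client['db-novels']['collection-book']

# Count the stored documents; 100 are expected if all four pages parsed cleanly
print(collection.count_documents({}))

# Show a few of the stored rows; note that 'rate' is stored as a string
for doc in collection.find().limit(5):
    print(doc['bname'], doc['rate'])

One caveat: because insert_one is used, re-running the crawler inserts duplicate documents; a possible refinement is to switch to update_one with upsert=True, keyed on the book name.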
The output of running the crawler looks like this: