Selenium 练习:提取斗鱼(Douyu)网站上的数据

发布于:2023-09-16 ⋅ 阅读:(62) ⋅ 点赞:(0)

运行代码时,它会打开斗鱼网站,并逐个打印每个房间的相关信息:标题、类型、所有者(主播)、观看人数和封面图片。

import time

from selenium import webdriver


class Douyu(object):
    """Crawl the Douyu live-room directory with Selenium and print every room.

    The crawler opens ``self.url``, extracts one record per room card on the
    current page, prints the records, and then clicks the "next page" control
    until no such control can be located.

    Args:
        driver: Optional, already-created WebDriver to use. When omitted, a
            new ``webdriver.Edge()`` instance is started (the original
            behaviour). An injected driver is never closed by this class.
        page_load_wait: Seconds to sleep before reading a page so that the
            dynamically rendered room list has time to appear. Defaults to
            the 3 seconds that were previously hard-coded.
    """

    # Locator strategy passed to find_element(s). This is the value of
    # selenium's ``By.XPATH`` constant; the generic find_element(s) API exists
    # in both Selenium 3 and 4, whereas the ``find_element(s)_by_xpath``
    # helpers were removed in Selenium 4.3.
    _XPATH = 'xpath'
    # One match per room card on the directory page.
    _ROOM_XPATH = '//*[@id="listAll"]/section[2]/div[2]/ul/li/div'
    # The "next page" control of the paginator (exact class match).
    _NEXT_XPATH = '//*[@class= "dy-Pagination-next"]'

    # Class-level fallbacks so instances built without this __init__
    # (e.g. by a subclass that overrides it) keep the original behaviour.
    page_load_wait = 3
    _owns_driver = True

    def __init__(self, driver=None, page_load_wait=3):
        self.url = 'https://www.douyu.com/directory/all'
        self.page_load_wait = page_load_wait
        # Only a driver we start ourselves is ours to shut down later.
        self._owns_driver = driver is None
        self.driver = webdriver.Edge() if driver is None else driver

    def _parse_room(self, room):
        """Build the record for one room card element.

        Args:
            room: The element of a single room card; every XPath below is
                evaluated relative to it.

        Returns:
            dict: ``title``, ``type``, ``owner`` and ``num`` hold the text of
            the matching child elements; ``picture`` holds the ``src``
            attribute of the cover image.
        """
        def text_of(xpath):
            return room.find_element(self._XPATH, xpath).text

        cover = room.find_element(self._XPATH, './a/div[1]/div[1]/picture/img')
        return {
            'title': text_of('./a/div[2]/div[1]/h3'),
            'type': text_of('./a/div[2]/div[1]/span'),
            'owner': text_of('./a/div[2]/div[2]/h2'),
            'num': text_of('./a/div[2]/div[2]/span'),
            'picture': cover.get_attribute('src'),
        }

    def parse_data(self):
        """Extract a record for every room card on the current page.

        Sleeps ``page_load_wait`` seconds first to give the page time to
        render.

        Returns:
            list: One dict per room card (see ``_parse_room``); an empty list
            when no room card matches.
        """
        time.sleep(self.page_load_wait)
        room_list = self.driver.find_elements(self._XPATH, self._ROOM_XPATH)
        return [self._parse_room(room) for room in room_list]

    def save_data(self, data_list):
        """Print each record of ``data_list`` on its own line."""
        for data in data_list:
            print(data)

    def run(self):
        """Open the directory and print every page of rooms.

        Pagination stops when the next-page control can no longer be located
        (presumably on the last page -- verify against the live site). A
        driver created by this instance is quit afterwards, even when an
        error interrupts the crawl, so no browser process is left behind.
        """
        try:
            self.driver.get(self.url)
            while True:
                self.save_data(self.parse_data())

                # find_elements returns an empty list instead of raising when
                # nothing matches, which gives a clean loop exit.
                next_buttons = self.driver.find_elements(
                    self._XPATH, self._NEXT_XPATH)
                if not next_buttons:
                    break

                # Scroll to the bottom first so the paginator is in view and
                # can receive the click.
                self.driver.execute_script('scrollTo(0,10000000)')
                next_buttons[0].click()
        finally:
            if self._owns_driver:
                self.driver.quit()


# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    Douyu().run()