【Python爬虫】案例_百度贴吧

发布于:2024-05-22 ⋅ 阅读:(51) ⋅ 点赞:(0)
import requests
from lxml import etree

class Tieba(object):
    """Fetch the first listing page of a Baidu Tieba forum and report how many
    thread-title links the XPath in ``parse_data`` matches."""

    def __init__(self, name):
        """Build the listing URL and request headers for the forum *name*.

        Args:
            name: Forum name, substituted into the ``kw`` query parameter.
        """
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}'.format(name)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
        }

    def get_data(self, url):
        """Download *url* and return the raw response body as bytes.

        Args:
            url: Address of the page to fetch.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        # Fetch the URL that was passed in (the parameter used to be ignored
        # in favour of self.url), and bound the wait so a stalled connection
        # cannot hang the script forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content

    def parse_data(self, data):
        """Parse the page *data* and print the number of matched thread links.

        Args:
            data: HTML document as returned by ``get_data``.
        """
        html = etree.HTML(data)
        # Anchors of the thread titles inside the element with id "thread_list".
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        print(len(el_list))

    def run(self):
        """Fetch the forum's listing page and parse it."""
        data = self.get_data(self.url)
        self.parse_data(data)

if __name__ == '__main__':
    # Script entry point: build a crawler for the forum named below and run it.
    crawler = Tieba('龙之信条2')
    crawler.run()

运行后发现结果是0,因为帖子列表所在的HTML被包在注释(<!-- ... -->)里,XPath匹配不到其中的元素

解决方法1:

更改User-Agent,将其改为低版本的浏览器

'User-Agent':'Mozilla/4.0(compatible;MSIE 5.01;Windows NT 5.0;DigExt)'

解决方法2:

用字符串替换(str.replace)去掉注释标记

data = data.decode().replace("<!--", "").replace("-->", "")

完整代码:

import requests
from lxml import etree

class Tieba(object):
    """Crawl thread titles and links from a Baidu Tieba forum, page by page."""

    def __init__(self, name):
        """Build the listing URL and request headers for the forum *name*.

        Args:
            name: Forum name, substituted into the ``kw`` query parameter.
        """
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}'.format(name)
        self.headers = {
            # Deliberately an old-browser User-Agent: with a modern one the
            # thread list reportedly arrives wrapped in an HTML comment, so
            # the XPath in parse_data matches nothing.
            'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.01;Windows NT 5.0;DigExt)'
        }

    def get_data(self, url):
        """Download *url* and return the raw response body as bytes.

        Args:
            url: Address of the page to fetch.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        # Fetch the URL that was passed in (the parameter used to be ignored
        # in favour of self.url), and bound the wait so a stalled connection
        # cannot hang the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content

    def parse_data(self, data):
        """Extract the threads and the next-page URL from one listing page.

        Args:
            data: HTML document as returned by ``get_data``.

        Returns:
            A ``(data_list, next_url)`` tuple. ``data_list`` is a list of
            dicts with keys ``'title'`` and ``'link'``; ``next_url`` is the
            URL of the following page, or ``None`` when no next-page link
            is found.
        """
        # Alternative to the old User-Agent: strip the comment markers first,
        # i.e. data = data.decode().replace("<!--", "").replace("-->", "")
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        print(len(el_list))

        data_list = [
            {
                'title': el.xpath('./text()')[0],
                'link': 'http://tieba.baidu.com' + el.xpath('./@href')[0],
            }
            for el in el_list
        ]

        # An empty result means there is no next-page link; test for that
        # explicitly instead of hiding every possible error behind a bare
        # ``except``.
        next_links = html.xpath('//a[contains(text(),"下一页>")]/@href')
        # Presumably the href is protocol-relative ("//host/..."), hence the
        # "https:" prefix -- confirm against the live page.
        next_url = ('https:' + next_links[0]) if next_links else None

        return data_list, next_url

    def save_data(self, data_list):
        """Print every record in *data_list*, one per line."""
        for data in data_list:
            print(data)

    def run(self):
        """Crawl from ``self.url`` until a page has no next-page link.

        ``self.url`` is advanced to each following page as the crawl proceeds.
        """
        while True:
            data = self.get_data(self.url)
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            if next_url is None:
                break
            self.url = next_url

if __name__ == '__main__':
    # Script entry point: build a crawler for the forum named below and run it.
    crawler = Tieba('龙之信条2')
    crawler.run()

可完善的地方:save_data()方法,将数据保存为csv或xlsx文件

快来试试吧

可参考:【Python爬虫】基本操作中"数据存储——CSV文件"


网站公告

今日签到

点亮在社区的每一天
去签到