import requests
from lxml import etree
class Tieba(object):
    """Fetch the first page of a Baidu Tieba forum and count its thread links.

    This version sends a modern desktop Chrome User-Agent. The thread list
    is reportedly returned wrapped in HTML comments for such browsers, in
    which case the XPath in ``parse_data`` matches nothing and 0 is printed.
    """

    def __init__(self, name):
        """Build the forum URL for ``name`` and the request headers.

        Args:
            name: Forum name, substituted into the ``kw`` query parameter.
        """
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}'.format(name)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        }

    def get_data(self, url, timeout=10):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as ``bytes``.
        """
        # Request the URL that was passed in (not self.url), and bound the
        # wait so a stalled connection cannot hang the script forever.
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content

    def parse_data(self, data):
        """Parse the page and print how many thread title links were found.

        Args:
            data: HTML document as returned by ``get_data``.
        """
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        print(len(el_list))

    def run(self):
        """Fetch the forum page and report the number of thread links."""
        data = self.get_data(self.url)
        self.parse_data(data)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    tieba = Tieba('龙之信条2')
    tieba.run()
运行后发现结果是0:返回的HTML中,帖子列表被包在注释(<!-- ... -->)里,所以XPath匹配不到任何节点
解决方法1:
更改User-Agent,将其改为低版本的浏览器
'User-Agent':'Mozilla/4.0(compatible;MSIE 5.01;Windows NT 5.0;DigExt)'
解决方法2:
用字符串替换去掉注释标记(这里用 str.replace 即可,无需正则表达式)
data = data.decode().replace("<!--", "").replace("-->", "")
完整代码:
import requests
from lxml import etree
class Tieba(object):
    """Crawl every page of a Baidu Tieba forum and print each thread's title and link."""

    def __init__(self, name):
        """Build the forum URL for ``name`` and the request headers.

        Args:
            name: Forum name, substituted into the ``kw`` query parameter.
        """
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw={}'.format(name)
        self.headers = {
            # An old-browser User-Agent is used on purpose: with a modern one
            # the thread list reportedly comes back wrapped in HTML comments,
            # which the XPath in parse_data cannot see.
            'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.01;Windows NT 5.0;DigExt)',
        }

    def get_data(self, url, timeout=10):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as ``bytes``.
        """
        # Request the URL that was passed in (not self.url), and bound the
        # wait so a stalled connection cannot hang the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content

    def parse_data(self, data):
        """Extract the threads on one page and the URL of the next page.

        Args:
            data: HTML document as returned by ``get_data``.

        Returns:
            A ``(data_list, next_url)`` tuple. ``data_list`` holds one dict
            per thread with the keys ``'title'`` and ``'link'``; ``next_url``
            is ``None`` when the page has no next-page link.
        """
        # Alternative to the old User-Agent: strip the comment markers first,
        # e.g. data = data.decode().replace("<!--", "").replace("-->", "")
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        print(len(el_list))

        data_list = []
        for el in el_list:
            titles = el.xpath('./text()')
            hrefs = el.xpath('./@href')
            # Skip anchors lacking text or href instead of aborting the page.
            if not titles or not hrefs:
                continue
            data_list.append({
                'title': titles[0],
                'link': 'http://tieba.baidu.com' + hrefs[0],
            })

        # The scheme is prepended to the href as-is; an empty result means
        # there is no next-page link.
        next_links = html.xpath('//a[contains(text(),"下一页>")]/@href')
        next_url = 'https:' + next_links[0] if next_links else None
        return data_list, next_url

    def save_data(self, data_list):
        """Print every scraped record.

        Args:
            data_list: Iterable of records to output.
        """
        for data in data_list:
            print(data)

    def run(self):
        """Crawl page after page until no next-page link is found."""
        while True:
            data = self.get_data(self.url)
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            if next_url is None:
                break
            # Continue from the next page on the following iteration.
            self.url = next_url
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    tieba = Tieba('龙之信条2')
    tieba.run()
可完善的地方:save_data()方法,将数据保存为csv或xlsx文件
快来试试吧
可参考:【Python爬虫】基本操作中"数据存储——CSV文件"