Python Practice Problem
Problem
Web Crawler and Data Visualization
Requirements:
- Write a crawler that scrapes the movie title, rating, director, lead actors, and release year from the Douban Movie Top 250 pages.
- Store the data in an SQLite database.
- Using the data in the database, generate a bar chart with Matplotlib showing the 10 highest-rated movies.
- Add exception handling (e.g., failed network requests, parsing errors, database errors).
Approach:
- Use requests and BeautifulSoup to fetch and parse the pages.
- Use sqlite3 to create the database and store the data.
- Use matplotlib to draw the bar chart.
- Add a retry mechanism and exception handling.
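The script depends on the third-party packages requests, beautifulsoup4, and matplotlib (installable with pip); sqlite3 ships with the Python standard library.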
Code implementation:
import requests
from bs4 import BeautifulSoup
import sqlite3
import matplotlib.pyplot as plt
from requests.exceptions import RequestException
import time
def fetch_movie_data(url):
    """Fetch one page of the Top 250 list, retrying on network errors."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except RequestException as e:
            print(f"Request failed ({attempt+1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2)  # wait 2 seconds before retrying
    return None
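The retry loop above is written by hand; the same behaviour can also be delegated to requests' connection adapter. A minimal sketch, assuming the hypothetical helper name make_session (not part of the original code):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(total_retries=3, backoff=2.0):
    # Build a Session that retries connection errors and 5xx responses automatically.
    session = requests.Session()
    retry = Retry(total=total_retries, backoff_factor=backoff,
                  status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    session.mount('http://', HTTPAdapter(max_retries=retry))
    return session

# Usage: session = make_session(); session.get(url, headers=headers, timeout=10)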
def parse_movie_data(html_content):
    """Parse movie entries out of one page of HTML."""
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    movies = []
    for item in soup.select('div.item'):
        try:
            title = item.select_one('span.title').text.strip()
            rating = float(item.select_one('span.rating_num').text)
            info = item.select_one('div.bd p').text.strip()
            # The first line of the info block looks like
            # "导演: ...\xa0\xa0\xa0主演: ..."; the last line starts with the year.
            first_line = info.split('\n')[0]
            director, *actors = first_line.replace('导演: ', '').split('主演: ')
            year = info.split('\n')[-1].strip().split('/')[0].strip()
            movies.append({
                'title': title,
                'rating': rating,
                'director': director.strip(),
                'actors': actors[0].strip() if actors else '',
                'year': year
            })
        except (AttributeError, ValueError, IndexError) as e:
            print(f"Parse error: {e}")
    return movies
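To see what the parser expects, here is a quick self-test with a hand-written snippet that mimics the assumed Top 250 markup; the HTML below is illustrative only, and the live page may differ:

sample_html = '''
<div class="item">
  <span class="title">肖申克的救赎</span>
  <span class="rating_num">9.7</span>
  <div class="bd"><p>导演: 弗兰克·德拉邦特\xa0\xa0\xa0主演: 蒂姆·罗宾斯
1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情</p></div>
</div>
'''
print(parse_movie_data(sample_html))
# Expected: one dict with the title, rating 9.7, director, actors, and year "1994"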
def save_to_database(movies):
    """Create the movies table if needed and insert the scraped records."""
    conn = sqlite3.connect('douban_movies.db')
    c = conn.cursor()
    try:
        c.execute('''CREATE TABLE IF NOT EXISTS movies
                     (id INTEGER PRIMARY KEY AUTOINCREMENT,
                      title TEXT NOT NULL,
                      rating REAL NOT NULL,
                      director TEXT,
                      actors TEXT,
                      year TEXT)''')
        for movie in movies:
            c.execute(
                '''INSERT INTO movies (title, rating, director, actors, year)
                   VALUES (?, ?, ?, ?, ?)''',
                (movie['title'], movie['rating'], movie['director'], movie['actors'], movie['year']))
        conn.commit()
    except sqlite3.Error as e:
        print(f"Database error: {e}")
        conn.rollback()
    finally:
        conn.close()
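Note that save_to_database uses plain INSERTs, so running the script twice stores every movie twice. One way to avoid that, sketched below with an assumed alternative table name movies_unique and a sample row, is to declare the title UNIQUE and use INSERT OR REPLACE:

import sqlite3

conn = sqlite3.connect('douban_movies.db')
c = conn.cursor()
# Variant schema: UNIQUE(title) lets repeated runs update rows instead of duplicating them.
c.execute('''CREATE TABLE IF NOT EXISTS movies_unique
             (id INTEGER PRIMARY KEY AUTOINCREMENT,
              title TEXT NOT NULL UNIQUE,
              rating REAL NOT NULL,
              director TEXT,
              actors TEXT,
              year TEXT)''')
c.execute('''INSERT OR REPLACE INTO movies_unique (title, rating, director, actors, year)
             VALUES (?, ?, ?, ?, ?)''',
          ('肖申克的救赎', 9.7, '弗兰克·德拉邦特', '蒂姆·罗宾斯', '1994'))
conn.commit()
conn.close()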
def plot_top_movies():
    """Read the 10 highest-rated movies from the database and plot a bar chart."""
    conn = sqlite3.connect('douban_movies.db')
    c = conn.cursor()
    try:
        c.execute('SELECT title, rating FROM movies ORDER BY rating DESC LIMIT 10')
        top_movies = c.fetchall()
        if not top_movies:
            print("No movie data in the database")
            return
        titles, ratings = zip(*top_movies)
        plt.figure(figsize=(10, 6))
        plt.barh(titles, ratings, color='skyblue')
        plt.xlabel('Rating')
        plt.ylabel('Movie')
        plt.title('Douban Top 10 Movies by Rating')
        plt.tight_layout()
        plt.savefig('top_movies.png')
        plt.show()
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        conn.close()
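The movie titles returned by the query are Chinese, and matplotlib's default fonts typically render CJK characters as empty boxes. If that happens, point matplotlib at a CJK-capable font installed on your machine before calling plot_top_movies(); the font name below is an assumption (SimHei is common on Windows), so substitute one that exists on your system:

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # any installed CJK-capable font works here
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign rendering correctly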
if __name__ == "__main__":
    all_movies = []
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        html = fetch_movie_data(url)
        movies = parse_movie_data(html)
        all_movies.extend(movies)
        print(f"Scraped {start+25}/250 movies")
        time.sleep(1)  # avoid sending requests too quickly
    if all_movies:
        save_to_database(all_movies)
        plot_top_movies()
    else:
        print("No movie data was scraped")