Python实例题:Web 爬虫与数据可视化

发布于:2025-06-25 ⋅ 阅读:(17) ⋅ 点赞:(0)

目录

Python实例题

题目

要求:

解题思路:

代码实现:

Python实例题

题目

Web 爬虫与数据可视化

要求

  • 编写一个爬虫,从豆瓣电影 Top250 页面(豆瓣电影 Top 250)抓取电影名称、评分、导演、主演和上映年份。
  • 将数据存储到 SQLite 数据库中。
  • 基于数据库数据,使用 Matplotlib 生成柱状图,展示评分最高的 10 部电影。
  • 添加异常处理(如网络请求失败、解析错误、数据库操作失败等)。

解题思路

  • 使用 requests 和 BeautifulSoup 实现网页爬取与解析。
  • 使用 sqlite3 建立数据库并存储数据。
  • 使用 matplotlib 绘制柱状图。
  • 添加重试机制和异常捕获。

代码实现

import requests
from bs4 import BeautifulSoup
import sqlite3
import matplotlib.pyplot as plt
from requests.exceptions import RequestException
import time

def fetch_movie_data(url, max_retries=3, timeout=10, retry_delay=2):
    """Download a page and return its HTML text, retrying on failure.

    Args:
        url: The page URL to fetch.
        max_retries: Total attempts before giving up (default 3, as before).
        timeout: Per-request timeout in seconds (default 10, as before).
        retry_delay: Seconds to wait between attempts (default 2, as before).

    Returns:
        The response body as text, or None if every attempt failed.
    """
    headers = {
        # Browser-like User-Agent — presumably to avoid anti-bot blocking
        # of the default requests UA; confirm against the target site.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # treat 4xx/5xx as failures too
            return response.text
        except RequestException as e:
            print(f"请求失败 ({attempt+1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(retry_delay)  # back off before retrying
    return None

def parse_movie_data(html_content):
    """Parse one Douban Top250 list page into a list of movie dicts.

    Args:
        html_content: HTML text of one list page, or None/empty when the
            fetch failed.

    Returns:
        List of dicts with keys: title, rating, director, actors, year.
        Items that fail to parse are skipped with a printed message.
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    movies = []
    for item in soup.select('div.item'):
        try:
            title = item.select_one('span.title').text.strip()
            rating = float(item.select_one('span.rating_num').text)
            info = item.select_one('div.bd p').text.strip()
            # First line looks like "导演: X\xa0\xa0\xa0主演: Y" — split off
            # the director, then the actors (may be absent).
            director, *actors = info.split('\xa0\xa0\xa0')[0].replace('导演: ', '').split('主演: ')
            # Last line looks like "YEAR / COUNTRY / GENRES"; the extra
            # .strip() fixes the trailing space that " / " left on the year.
            year = info.split('\n')[-1].strip().split('/')[0].strip()
            movies.append({
                'title': title,
                'rating': rating,
                'director': director.strip(),
                'actors': actors[0].strip() if actors else '',
                'year': year
            })
        except (AttributeError, ValueError, IndexError) as e:
            print(f"解析错误: {e}")
    return movies

def save_to_database(movies, db_path='douban_movies.db'):
    """Persist parsed movie dicts into an SQLite database.

    Args:
        movies: Iterable of dicts with keys title, rating, director,
            actors and year (as produced by parse_movie_data).
        db_path: Path of the SQLite file; created on first use.
            Defaults to the original hard-coded location.

    On any sqlite3 error the transaction is rolled back and the error is
    printed; the connection is always closed.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS movies
                     (id INTEGER PRIMARY KEY AUTOINCREMENT,
                     title TEXT NOT NULL,
                     rating REAL NOT NULL,
                     director TEXT,
                     actors TEXT,
                     year TEXT)''')
        # executemany batches all rows through one prepared statement
        # instead of a Python-level loop of execute() calls.
        c.executemany(
            '''INSERT INTO movies (title, rating, director, actors, year)
               VALUES (?, ?, ?, ?, ?)''',
            [(m['title'], m['rating'], m['director'], m['actors'], m['year'])
             for m in movies])
        conn.commit()
    except sqlite3.Error as e:
        print(f"数据库错误: {e}")
        conn.rollback()
    finally:
        conn.close()

def plot_top_movies(db_path='douban_movies.db', top_n=10):
    """Draw a horizontal bar chart of the highest-rated movies.

    Args:
        db_path: SQLite file written by save_to_database.
            Defaults to the original hard-coded location.
        top_n: How many movies to chart (default 10, as before).

    Saves the figure to top_movies.png and shows it interactively.
    Prints the error on any sqlite3 failure; always closes the connection.
    """
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    try:
        # LIMIT is bound as a parameter so top_n stays injection-safe.
        c.execute('SELECT title, rating FROM movies ORDER BY rating DESC LIMIT ?',
                  (top_n,))
        top_movies = c.fetchall()
        if not top_movies:
            print("数据库中没有电影数据")
            return

        titles, ratings = zip(*top_movies)
        plt.figure(figsize=(10, 6))
        plt.barh(titles, ratings, color='skyblue')
        # NOTE(review): the Chinese labels below need a CJK-capable font
        # (e.g. plt.rcParams['font.sans-serif'] = ['SimHei']) or they may
        # render as boxes — confirm on the target machine.
        plt.xlabel('评分')
        plt.ylabel('电影名称')
        plt.title('豆瓣电影评分Top10')
        plt.tight_layout()
        plt.savefig('top_movies.png')
        plt.show()
    except sqlite3.Error as e:
        print(f"数据库错误: {e}")
    finally:
        conn.close()

if __name__ == "__main__":
    all_movies = []
    # Top250 is paginated 25 per page via the ?start= query offset.
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        html = fetch_movie_data(url)
        movies = parse_movie_data(html)
        all_movies.extend(movies)
        # Report what was actually parsed; the old "已抓取 {start+25}/250"
        # message claimed progress even when the fetch or parse failed.
        print(f"本页解析到 {len(movies)} 部,累计 {len(all_movies)}/250 部电影")
        time.sleep(1)  # throttle so requests are not too fast

    if all_movies:
        save_to_database(all_movies)
        plot_top_movies()
    else:
        print("未抓取到任何电影数据")

网站公告

今日签到

点亮在社区的每一天
去签到