Datawhale Introduction to Web Crawling, Notes 2

Published: 2025-06-21

Regular Expressions

A regular expression (Regular Expression) is a set of syntax rules for matching strings by means of pattern expressions.

Regex syntax: metacharacters are arranged and combined to match strings.

Test regular expressions online: https://tool.oschina.net/regex/ (OSCHINA.NET online tools)

Metacharacters: special symbols with fixed meanings.

.*? matches as little as possible (lazy), while .* matches as much as possible (greedy).
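A minimal sketch of the difference, run against a throwaway HTML string:

import re

html = "<span>first</span><span>second</span>"

# Greedy: .* grabs as much as possible, so the match runs to the last </span>
print(re.findall(r"<span>(.*)</span>", html))   # ['first</span><span>second']

# Lazy: .*? stops at the earliest </span> that completes a match
print(re.findall(r"<span>(.*?)</span>", html))  # ['first', 'second']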

The re module
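Python exposes regular expressions through the re module. A quick sketch of the calls the two cases below rely on (re.compile, finditer, findall, search, and named groups), using a made-up string:

import re

text = "Tom 23, Jerry 18, Spike 30"

# findall returns a list of the matched (or captured) strings
print(re.findall(r"\d+", text))              # ['23', '18', '30']

# search returns the first match object, or None if nothing matches
m = re.search(r"(\w+) (\d+)", text)
print(m.group(1), m.group(2))                # Tom 23

# compile + finditer: iterate over match objects, here with named groups
pattern = re.compile(r"(?P<name>\w+) (?P<age>\d+)")
for m in pattern.finditer(text):
    print(m.group("name"), m.group("age"))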

Example 1: tackling the Douban Top 250 movie list

import requests
import re
import csv
import time
import random

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://movie.douban.com/top250"
}

# Regex for one movie entry; named groups capture the fields, and the literal 人评价 ("people rated") matches text in the Douban HTML
pattern = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<br>\s*(?P<year>\d{4}).*?'
    r'<span class="rating_num"[^>]*>(?P<score>\d+\.\d+)</span>.*?'
    r'<span>(?P<num>[\d,]+)人评价</span>',
    re.S
)

# Create the CSV file and write the header row
with open("douban_top250.csv", mode="w", newline="", encoding="utf-8-sig") as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["电影名称", "上映年份", "评分", "评分人数"])

    # Crawl 10 pages (25 movies per page)
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        print(f"正在抓取第 {start//25 + 1} 页:{url}")

        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text

            # Save the page for debugging (optional)
            with open(f"page_debug_{start}.html", "w", encoding="utf-8") as f_debug:
                f_debug.write(html)

            matches = list(pattern.finditer(html))
            print(f"✅ 第 {start//25 + 1} 页成功,匹配到 {len(matches)} 条")

            for m in matches:
                name = m.group("name").strip()
                year = m.group("year").strip()
                score = m.group("score").strip()
                num = m.group("num").replace(",", "").strip()

                csvwriter.writerow([name, year, score, num])

            # Random delay to avoid triggering anti-scraping measures
            time.sleep(random.uniform(1, 2))

        except Exception as e:
            print(f"❌ 抓取失败:{e}")

print("🎉 所有页面抓取完毕,已保存到 douban_top250.csv")

Result:
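If a page reports 0 matches, a quick standalone check is to rerun the same pattern against one of the page_debug_*.html files the script saves (here page_debug_0.html, written for the first page):

import re

# Same pattern as in the script above
pattern = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<br>\s*(?P<year>\d{4}).*?'
    r'<span class="rating_num"[^>]*>(?P<score>\d+\.\d+)</span>.*?'
    r'<span>(?P<num>[\d,]+)人评价</span>',
    re.S
)

with open("page_debug_0.html", encoding="utf-8") as f:
    html = f.read()

# Print the first few matches to confirm the pattern still fits the page layout
for m in list(pattern.finditer(html))[:3]:
    print(m.group("name"), m.group("year"), m.group("score"), m.group("num"))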

Example 2: scraping the Quotes to Scrape site

import re
import requests

def scrape_quotes():
    url = "http://quotes.toscrape.com/"
    response = requests.get(url)
    
    if response.status_code != 200:
        print("Failed to fetch page")
        return
    
    html = response.text
    
    # Match every quote block with a regex
    quotes_pattern = re.compile(
        r'<div class="quote".*?>(.*?)</div>', 
        re.DOTALL
    )
    quotes = quotes_pattern.findall(html)
    
    results = []
    for quote in quotes:
        # Extract the quote text
        text_match = re.search(
            r'<span class="text".*?>(.*?)</span>', 
            quote
        )
        text = text_match.group(1).strip() if text_match else "N/A"
        
        # Extract the author
        author_match = re.search(
            r'<small class="author".*?>(.*?)</small>', 
            quote
        )
        author = author_match.group(1).strip() if author_match else "N/A"
        
        # Extract the tags
        tags = re.findall(
            r'<a class="tag".*?>(.*?)</a>', 
            quote
        )
        
        results.append({
            "text": text,
            "author": author,
            "tags": tags
        })
    
    # Print the results
    for i, result in enumerate(results, 1):
        print(f"Quote {i}:")
        print(f"  Text: {result['text']}")
        print(f"  Author: {result['author']}")
        print(f"  Tags: {', '.join(result['tags'])}\n")

if __name__ == "__main__":
    scrape_quotes()

Output:

The bs4 module

Example: crawling images from the Umei gallery (优美图库): https://www.umei.cc/weimeitupian/
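Before the crawler below, a minimal sketch of the bs4 calls it relies on (BeautifulSoup(), find_all(), tag.get(), .text), run against a small inline HTML snippet with placeholder URLs rather than a live page:

from bs4 import BeautifulSoup

html = """
<div class="content">
  <img src="https://example.com/a.jpg" alt="first">
  <img data-original="https://example.com/b.jpg" alt="second">
  <a class="tag" href="/t/1">nature</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")

# find_all returns every matching tag; attributes are read with .get()
for img in soup.find_all("img"):
    print(img.get("src") or img.get("data-original"))

# find returns the first match; .text gives the tag's inner text
link = soup.find("a", class_="tag")
print(link.text, link.get("href"))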

import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin

# Create the image folder if it does not exist
img_dir = "img"
if not os.path.exists(img_dir):
    os.makedirs(img_dir)

headers = {
    "User-Agent": "Mozilla/5.0"
}

# Sub-page links collected beforehand (replace with your own list of real links)
page_urls = [
    "https://www.umei.cc/weimeitupian/oumeitupian/331451.htm",
    "https://www.umei.cc/weimeitupian/oumeitupian/331450.htm",
    # Add more links here as needed
]

for index, page_url in enumerate(page_urls):
    try:
        print(f"正在处理第 {index + 1} 个子页面: {page_url}")
        resp = requests.get(page_url, headers=headers, timeout=10)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # Images may appear under .big-pic, .content, .image or #img, so grab every <img> tag
        img_tags = soup.find_all("img")
        img_count = 0

        for img_tag in img_tags:
            img_url = img_tag.get("src") or img_tag.get("data-original")
            if not img_url:
                continue
            # Resolve relative paths against the page URL (this is what urljoin is imported for)
            img_url = urljoin(page_url, img_url)
            if img_url.startswith("http") and ".jpg" in img_url:
                img_count += 1
                filename = os.path.join(img_dir, f"page{index + 1}_{img_count}.jpg")

                # Download the image
                img_data = requests.get(img_url, headers=headers).content
                with open(filename, 'wb') as f:
                    f.write(img_data)
                print(f"已保存图片:{filename}")

        if img_count == 0:
            print("⚠️ 没有找到有效图片")

        time.sleep(1)  # Throttle requests so we don't get blocked

    except Exception as e:
        print(f"❌ 处理页面 {page_url} 时出错:{e}")

print("🎉 所有子页面处理完成!")

Output:

