Regular Expressions
A regular expression (Regular Expression) is a set of syntax rules for matching strings with a pattern expression.
Regex syntax: metacharacters are arranged and combined to describe the strings to match.
Metacharacter: a special symbol with a fixed, predefined meaning.
.*? matches as few characters as possible (non-greedy), while .* matches as many characters as possible (greedy).
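A minimal sketch of the difference (the sample string and patterns below are made up for illustration):

import re

html = '<span class="title">Title One</span><span class="title">Title Two</span>'

# Greedy: .* grabs as much as possible, so both spans end up in a single match
print(re.findall(r'<span class="title">(.*)</span>', html))
# ['Title One</span><span class="title">Title Two']

# Non-greedy: .*? stops at the first possible </span>, one title per match
print(re.findall(r'<span class="title">(.*?)</span>', html))
# ['Title One', 'Title Two']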
The re module
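The re functions the case study below relies on are re.compile, finditer, named groups ((?P<name>...)) and the re.S flag. A minimal sketch on a made-up HTML fragment (the tag names mirror the Douban markup, but the data is invented):

import re

html = '''
<li><span class="title">Movie One</span>
<span class="rating_num">9.7</span></li>
<li><span class="title">Movie Two</span>
<span class="rating_num">9.6</span></li>
'''

# re.S (re.DOTALL) lets . also match newlines, so one pattern can span several lines
pattern = re.compile(
    r'<span class="title">(?P<name>.*?)</span>.*?'
    r'<span class="rating_num">(?P<score>.*?)</span>',
    re.S
)

# finditer yields Match objects; group("name") reads a named capture group
for m in pattern.finditer(html):
    print(m.group("name"), m.group("score"))
# Movie One 9.7
# Movie Two 9.6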
Case 1: Scraping the Douban Top 250 movie list
import requests
import re
import csv
import time
import random
# Request headers: the User-Agent and Referer make the request look like a normal browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://movie.douban.com/top250"
}
# Regular expression: named groups capture title, year, rating and vote count;
# re.S lets . match newlines so the pattern can span several lines of HTML
pattern = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<br>\s*(?P<year>\d{4}).*?'
    r'<span class="rating_num"[^>]*>(?P<score>\d+\.\d+)</span>.*?'
    r'<span>(?P<num>[\d,]+)人评价</span>',
    re.S
)
# Create the CSV file and write the header row
with open("douban_top250.csv", mode="w", newline="", encoding="utf-8-sig") as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["Title", "Year", "Rating", "Votes"])

    # Crawl 10 pages (25 movies per page)
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        print(f"Fetching page {start//25 + 1}: {url}")
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text

            # Save the raw page for debugging (optional)
            with open(f"page_debug_{start}.html", "w", encoding="utf-8") as f_debug:
                f_debug.write(html)

            matches = list(pattern.finditer(html))
            print(f"✅ Page {start//25 + 1} fetched, {len(matches)} matches")

            for m in matches:
                name = m.group("name").strip()
                year = m.group("year").strip()
                score = m.group("score").strip()
                num = m.group("num").replace(",", "").strip()
                csvwriter.writerow([name, year, score, num])

            # Random delay so the crawler is less likely to be blocked
            time.sleep(random.uniform(1, 2))
        except Exception as e:
            print(f"❌ Fetch failed: {e}")

print("🎉 All pages done, results saved to douban_top250.csv")
Result:
Case 2: Scraping the Quotes to Scrape site
import re
import requests
def scrape_quotes():
    url = "http://quotes.toscrape.com/"
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to fetch page")
        return
    html = response.text

    # Match every quote block
    quotes_pattern = re.compile(
        r'<div class="quote".*?>(.*?)</div>',
        re.DOTALL
    )
    quotes = quotes_pattern.findall(html)

    results = []
    for quote in quotes:
        # Extract the quote text
        text_match = re.search(
            r'<span class="text".*?>(.*?)</span>',
            quote
        )
        text = text_match.group(1).strip() if text_match else "N/A"

        # Extract the author
        author_match = re.search(
            r'<small class="author".*?>(.*?)</small>',
            quote
        )
        author = author_match.group(1).strip() if author_match else "N/A"

        # Extract the tags
        tags = re.findall(
            r'<a class="tag".*?>(.*?)</a>',
            quote
        )

        results.append({
            "text": text,
            "author": author,
            "tags": tags
        })

    # Print the results
    for i, result in enumerate(results, 1):
        print(f"Quote {i}:")
        print(f"  Text: {result['text']}")
        print(f"  Author: {result['author']}")
        print(f"  Tags: {', '.join(result['tags'])}\n")


if __name__ == "__main__":
    scrape_quotes()
Output:
The bs4 module
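BeautifulSoup parses HTML into a tree that can be searched with find / find_all instead of hand-written regexes. A minimal sketch of the calls the crawler below relies on (find_all and tag.get), run on a made-up HTML string:

from bs4 import BeautifulSoup

# Made-up HTML, only to illustrate the API used in the crawler below
html = """
<div class="pic">
  <img src="https://example.com/a.jpg" alt="first">
  <img data-original="https://example.com/b.jpg" alt="lazy-loaded">
</div>
"""

soup = BeautifulSoup(html, "html.parser")

# find_all collects every matching tag; tag.get reads an attribute (None if missing)
for img in soup.find_all("img"):
    src = img.get("src") or img.get("data-original")
    print(img.get("alt"), "->", src)
# first -> https://example.com/a.jpg
# lazy-loaded -> https://example.com/b.jpg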
Crawler for the 唯美图片 (aesthetic pictures) gallery on 优美图库: https://www.umei.cc/weimeitupian/
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin
# Create the image folder if it does not exist yet
img_dir = "img"
if not os.path.exists(img_dir):
    os.makedirs(img_dir)

headers = {
    "User-Agent": "Mozilla/5.0"
}

# Sub-page links collected beforehand (replace with the real link list;
# see the sketch after this example for one way to collect them from the listing page)
page_urls = [
    "https://www.umei.cc/weimeitupian/oumeitupian/331451.htm",
    "https://www.umei.cc/weimeitupian/oumeitupian/331450.htm",
    # add more links here
]
for index, page_url in enumerate(page_urls):
    try:
        print(f"Processing sub-page {index + 1}: {page_url}")
        resp = requests.get(page_url, headers=headers, timeout=10)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # Images may appear under .big-pic, .content, .image or #img depending on the page,
        # so simply collect every <img> tag and filter below
        img_tags = soup.find_all("img")

        img_count = 0
        for img_tag in img_tags:
            img_url = img_tag.get("src") or img_tag.get("data-original")
            if not img_url:
                continue
            # Resolve relative URLs against the current page
            img_url = urljoin(page_url, img_url)
            if img_url.startswith("http") and ".jpg" in img_url:
                img_count += 1
                filename = os.path.join(img_dir, f"page{index + 1}_{img_count}.jpg")

                # Download and save the image
                img_data = requests.get(img_url, headers=headers).content
                with open(filename, 'wb') as f:
                    f.write(img_data)
                print(f"Saved image: {filename}")

        if img_count == 0:
            print("⚠️ No valid images found")

        time.sleep(1)  # slow down so the site does not block the crawler

    except Exception as e:
        print(f"❌ Error while processing {page_url}: {e}")

print("🎉 All sub-pages processed!")
Output:
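The crawler above hard-codes its sub-page URLs. One possible way to collect them from the listing page itself is sketched below; note that the filter (every .htm link under /weimeitupian/) is an assumption about the site's markup and may need adjusting against the real page:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

list_url = "https://www.umei.cc/weimeitupian/"
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(list_url, headers=headers, timeout=10)
resp.encoding = "utf-8"
soup = BeautifulSoup(resp.text, "html.parser")

# Assumption: detail pages are plain <a href="....htm"> links somewhere in the listing;
# a narrower selector (a specific list container) may be needed on the real page.
page_urls = []
for a in soup.find_all("a", href=True):
    href = urljoin(list_url, a["href"])
    if "/weimeitupian/" in href and href.endswith(".htm") and href not in page_urls:
        page_urls.append(href)

print(len(page_urls), "candidate sub-page links collected")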