Python Crawler Series: Scraping Driving-Test Questions (the site has Cookie verification like last time, so Selenium is used; the Word document is generated with Python)


As the screenshot above shows, the questions are scraped from the small site https://kaocheche.com/.

Install the required libraries before running:

pip install requests beautifulsoup4 python-docx pillow selenium webdriver-manager
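If you want to verify that Selenium and webdriver-manager are set up correctly before running the full script, this quick check (my own sketch, not part of the original code) just launches Chrome and prints the page title:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# webdriver-manager downloads a chromedriver matching your installed Chrome
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://kaocheche.com/")
print(driver.title)  # a non-empty title means the environment is ready
driver.quit()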

Usage: modify the three variables below:

project_path = "E:\\" # directory where the Word document is saved

#....... code omitted here .........

if __name__ == "__main__":
    docname = "2024年科目四考试题库" # set the Word file name
    url = "https://kaocheche.com/tiku/kemu1/" # set the catalog URL of the questions to scrape

All question banks on this site (https://kaocheche.com/) can be scraped, including trucks, large and small passenger vehicles, motorcycles, ride-hailing, and the "three abilities" test for drivers aged 70 and over; see the batch-run sketch after the code listing.

The full code is as follows:

import requests
from bs4 import BeautifulSoup as bs
from docx import Document
from docx.shared import Inches
from docx.shared import Pt
from docx.shared import RGBColor
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import os
import time

# Directory where the Word document is saved
project_path = "E:\\"

def get_problem(driver, url):# Scrape one question page
    driver.get(url)
    try:
        # A compound class name needs a CSS selector (By.CLASS_NAME takes a single class)
        element = WebDriverWait(driver,3).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,".anchor.anchorWithHideOnScrollNavbar_WYt5")))
        problem = element.text.strip()
        print(problem)
        picture,option = get_Picture(driver)
        # The second matching heading, when present, holds the answer
        answer = driver.find_elements(By.CSS_SELECTOR,"h2.anchor.anchorWithHideOnScrollNavbar_WYt5")
        if len(answer) > 1:
            answer = answer[1].text.strip()
        else:
            answer = ""
        parser = get_parser(driver)
        print(problem + "\n" + picture + "\n" + option + "\n" + answer + "\n" + parser)
        return problem,picture,option,answer,parser
    except TimeoutException:# Selenium raises TimeoutException, not the builtin TimeoutError
        print(url + " page timed out!")
        # Return an empty record so the caller's tuple unpacking does not crash
        return "","","","",""

def get_Picture(driver):# Download the question's illustration, if any
    picture = ""
    try:
        picture_element = driver.find_element(By.CLASS_NAME,"img_ev3q")
        src = picture_element.get_attribute("src")
        if src:
            res = requests.get(src)
            filename = src.split('/')[-1]
            picture_dir = project_path + "picture\\"
            os.makedirs(picture_dir, exist_ok=True)# create the image folder on first run
            picture = picture_dir + filename
            with open(picture,"wb") as f:
                f.write(res.content)# the with-block closes the file automatically
    except NoSuchElementException:# some questions have no illustration
        pass
    return picture,get_option(driver)

def get_option(driver):# Scrape the answer options
    options = driver.find_elements(By.CSS_SELECTOR,"div.theme-doc-markdown.markdown ul li")
    options = [li.text for li in options]
    option = "\n".join(options)
    return option

def get_parser(driver):# Scrape the answer explanation
    try:
        parsers = driver.find_element(By.CSS_SELECTOR,"span.token.plain")
        return parsers.text.strip()
    except NoSuchElementException:
        return ""

def check_image(file_path):# Verify the downloaded image is intact
    try:
        with Image.open(file_path) as img:
            img.verify()  # raises if the file is truncated or corrupt
        return True
    except (IOError, SyntaxError) as e:
        print(f"Corrupted image: {file_path}, error: {e}")
        return False

def WriteDocx(doc, driver, url, starttime):# Write one question to the Word document, with formatting
    problem,picture,option,answer,parser = get_problem(driver, url)

    if answer == "":
        print("---------------- Question:",problem,"failed to write! ----------------")
        return problem,0
    # Question text: large, bold
    paragraph1 = doc.add_paragraph()
    run1 = paragraph1.add_run(problem + "\n")
    run1.font.size = Pt(20)
    run1.bold = True
    # Insert the illustration, skipping the site's WeChat QR-code image
    if picture != "" and picture != project_path + "picture\\wechat-2f1dfcbb45f7f4c3a823a3f3bbb22b9d.png":
        if check_image(picture):
            doc.add_picture(picture,width=Inches(4.0))
        else:
            print(picture,"image corrupted, not inserted!")

    # Options, answer (red, bold) and explanation
    paragraph2 = doc.add_paragraph()
    run2 = paragraph2.add_run(option + "\n")
    run2.font.size = Pt(16)
    run3 = paragraph2.add_run(answer + "\n")
    run3.font.size = Pt(18)
    run3.font.color.rgb = RGBColor(255,0,0)
    run3.bold = True
    run4 = paragraph2.add_run(parser)
    run4.font.size = Pt(16)
    # One question per page
    doc.add_page_break()
    endtime = time.time()
    return problem,endtime - starttime

def getUrl(doc, driver, MainUrl):# Collect every question link from a catalog page
    res = requests.get(MainUrl)
    soup = bs(res.content,"html.parser")
    Urls = soup.select("a.card.padding--lg.cardContainer_fWXF")
    Urls = [Url['href'] for Url in Urls]

    for url in Urls:
        starttime = time.time()
        problem, runtime = WriteDocx(doc, driver, "https://kaocheche.com" + url, starttime)
        if runtime:
            print("---------------- Question:",problem,f"written successfully! ({runtime:.2f} s) ----------------")

def main(docname,url):# Takes two arguments: the Word file name and the catalog URL to scrape
    starttime = time.time()
    # Create a new Word document
    doc = Document()

    # Add a heading to the document
    head = doc.add_heading(docname)
    for run in head.runs:
        run.font.size = Pt(26)

    # webdriver-manager fetches a chromedriver matching the installed Chrome
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    getUrl(doc, driver, url)# e.g. "https://kaocheche.com/zige/sanli/"
    driver.quit()# close the browser when done

    doc.save(project_path + docname + ".docx")

    endtime = time.time()
    print(f"Done! Total time: {(endtime - starttime) / 60:.2f} minutes!")

if __name__ == "__main__":
    docname = "2024年科目四考试题库" # Word file name
    url = "https://kaocheche.com/tiku/kemu1/" # catalog URL of the questions to scrape
    # Replace the catalog URL here to scrape any question bank on the site,
    # including trucks, buses, motorcycles, ride-hailing, and the 70+ "three abilities" test.
    # The master catalog of all exams is at https://kaocheche.com/
    main(docname,url)
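To scrape several question banks in one run, you can simply call main() once per catalog. A minimal sketch; the sanli URL comes from the comment in main(), and the file names here are placeholders:

# Hypothetical batch run: (file name, catalog URL) pairs
banks = [
    ("2024年科目四考试题库", "https://kaocheche.com/tiku/kemu1/"),
    ("三力测试题库", "https://kaocheche.com/zige/sanli/"),
]
for docname, url in banks:
    main(docname, url)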

This crawler is simple; the part worth studying is how it writes the docx file. The Selenium usage was covered in the previous article, so I won't repeat it here. If anything is unclear, leave me a comment. Thanks for reading!
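If the docx-writing technique is all you're after, here it is boiled down to a standalone sketch using the same python-docx calls as WriteDocx() (the file names are made up for illustration):

from docx import Document
from docx.shared import Pt, Inches, RGBColor

doc = Document()
doc.add_heading("Sample question bank")

p = doc.add_paragraph()
q = p.add_run("Question text\n")  # one paragraph can hold several runs,
q.font.size = Pt(20)              # each with its own formatting
q.bold = True
a = p.add_run("Answer: A")
a.font.size = Pt(18)
a.font.color.rgb = RGBColor(255, 0, 0)  # red, like the answers in the crawler
a.bold = True

# doc.add_picture("example.png", width=Inches(4.0))  # optional illustration
doc.add_page_break()  # one question per page
doc.save("sample.docx")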

PS: Please add a delay when scraping to reduce the load on this small site; running a small site isn't easy. -_-!
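One way to do that, keeping the getUrl() loop from the listing above, is a one-second sleep between questions (my suggested tweak, not in the original code):

    for url in Urls:
        starttime = time.time()
        problem, runtime = WriteDocx(doc, driver, "https://kaocheche.com" + url, starttime)
        if runtime:
            print("---------------- Question:",problem,f"written successfully! ({runtime:.2f} s) ----------------")
        time.sleep(1)  # pause between requests to go easy on the small site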

If you only need the questions and answers, I already have them ready-made; I'll upload them to CSDN in a couple of days and update this post.

