python基础21（2025.6.28）_全栈爬取（LinkExtractor）_车168以及诗词名句案例

发布于：2025-06-30 ⋅ 阅读:(15) ⋅ 点赞:(0)

案例一：诗词名句案例：（代码包里）

创建项目与爬虫的命令：（注意：这里创建爬虫时要加 -t crawl 使用 CrawlSpider 模板，和之前的创建方式不一样）
scrapy startproject qichezhijia
scrapy genspider -t crawl 爬虫名 域名   （例如：scrapy genspider -t crawl che168 che168.com）

案例二：二手车

一、che168案例解析器代码

class Che168Spider(CrawlSpider):
    """Crawl the che168.com used-car listing and parse every detail page.

    The first rule extracts links from the listing's ``viewlist_ul`` items and
    sends them to :meth:`parse_item`; the second rule follows the links found
    inside the ``listpagination`` block so that every listing page is visited.
    """

    name = "che168"
    allowed_domains = ["che168.com"]
    start_urls = ["https://www.che168.com/china/list/#pvareaid=105575"]

    # Every label that may show up on a detail page, mapped to the item key
    # it is stored under.
    temp = {
        "表显里程": "li_cheng",
        "上牌时间": "start_time",
        "挡位/排量": "pai_liang",
        "车辆所在地": "location",
        "查看限迁地": "guo_biao",
    }

    rules = (
        # Go straight from the listing to each detail page.
        # deny_domains drops links that point at the advertising domain.
        Rule(
            LinkExtractor(
                restrict_xpaths=("//ul[@class='viewlist_ul']/li",),
                deny_domains=("topicm.che168.com",),
            ),
            callback="parse_item",
        ),
        # Follow the pagination links to reach the remaining listing pages.
        Rule(
            LinkExtractor(restrict_xpaths=("//div[@id='listpagination']",)),
            follow=True,
        ),
    )

    def parse_item(self, resp):
        """Parse one detail page and print the extracted fields.

        Args:
            resp: The response of a detail page (anything exposing ``url``
                and ``xpath``).

        The item always contains every key listed in ``temp`` plus ``name``;
        a field that is missing on the page keeps the placeholder ``"未知"``.
        """
        print(resp.url)

        car_name = resp.xpath("//h3[@class='car-brand-name']/text()").extract_first()
        li_list = resp.xpath("//ul[@class='brand-unit-item fn-clear']/li")

        # Initialise every field up front so the item shape is always complete.
        item = {
            "name": car_name if car_name is not None else "未知",
            "li_cheng": "未知",
            "start_time": "未知",
            "pai_liang": "未知",
            "location": "未知",
            "guo_biao": "未知",
        }

        for li in li_list:
            # The label may be split across several text nodes; join them and
            # drop spaces so that e.g. "挡位 / 排量" matches "挡位/排量".
            sm_title = "".join(li.xpath("./p//text()").extract()).replace(" ", "")

            key = self.temp.get(sm_title)
            if key is None:
                # Unmapped label: skip it instead of storing it under None.
                continue

            sm_value = li.xpath("./h4/text()").extract_first()
            if sm_value is None:
                # No <h4> text on this row: keep the placeholder.
                continue

            item[key] = sm_value.replace(" ", "")

        print(item)
        # Debugging tip: to find pages whose layout was not recognised, print
        # resp.url whenever item["li_cheng"] is still "未知".