爬虫案例学习6

发布于:2024-12-19 ⋅ 阅读:(18) ⋅ 点赞:(0)

获取淘宝商品数据2024-12-18

参考学习:
大佬博客
视频教程
通过搜索发现,数据是通过发送请求过来的,不是静态存在源代码的
在这里插入图片描述
所以我们需要请求这个接口获取数据:比如标题,价格,图片等信息
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/
但是我们直接发请求,携带上参数,无法获取到数据,会返回非法请求的字样。
因为有个参数sign是加密的,我们需要逆向

逆向参数获取sign

sign参数:看起来是若干参数拼接之后,经过哈希算法(例如MD5)生成的32位小写十六进制字符串。
具体的需要查看对应的js
点击main.js
在这里插入图片描述
在这里插入图片描述
搜索 sign: 相关的代码,进行分析
在这里插入图片描述
eT = eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)这一行就是生成sign
在这里插入图片描述
点击断点,可以查看变量的值
不过这里要注意打断点的时机:先用鼠标滚动到下面的页码处,再打断点,然后点击下一页,此时进入js源码时的参数才是正确的。
如果不这样做,鼠标滚轮下滑时也会触发断点,此时ep.data的值不是我们需要的,需要放行很多次。
在这里插入图片描述
在这里插入图片描述
切换到控制台,输出这些值,等一下在python代码中需要使用,这里先记录一下

eE(em.token + "&" + eC + "&" + eS + "&" + ep.data) # 返回值是sign的值
em.token
eC 
eS 
ep.data

在这里插入图片描述

获得的sign为 8a3593958c55ff4115e359745dc9a665,是由0-9、a-f组成的32位字符串,也就是MD5哈希值的十六进制表示
我们需要在代码中生成sign

构建字符串MD5加密

# String to be hashed: str = em.token + "&" + eC + "&" + eS + "&" + ep.data
# eC is the timestamp
def getSign(eC):
    """Compute the request ``sign`` for the timestamp ``eC``.

    Mirrors the page's JavaScript, where the sign is
    ``eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)``.
    This snippet only sets up the inputs; the serialization, the MD5 step
    and the ``return`` are shown in the next snippet of the article.
    """
    em = 'cbee62bc9b064d508514dd6eb1c6cebd' # em holds the token (em.token in the JS)
    eS = '12574478'
    # signParam is the "params" field of ep.data, copied from the browser
    # console with JSON null rewritten as None
    signParam = {
	"device": "HMA-AL00",
	"isBeta": "false",
	"grayHair": "false",
	"from": "nt_history",
	"brand": "HUAWEI",
	"info": "wifi",
	"index": "4",
	"rainbow": "",
	"schemaType": "auction",
	"elderHome": "false",
	"isEnterSrpSearch": "true",
	"newSearch": "false",
	"network": "wifi",
	"subtype": "",
	"hasPreposeFilter": "false",
	"prepositionVersion": "v2",
	"client_os": "Android",
	"gpsEnabled": "false",
	"searchDoorFrom": "srp",
	"debug_rerankNewOpenCard": "false",
	"homePageVersion": "v7",
	"searchElderHomeOpen": "false",
	"search_action": "initiative",
	"sugg": "_4_1",
	"sversion": "13.6",
	"style": "list",
	"ttid": "600000@taobao_pc_10.7.0",
	"needTabs": "true",
	"areaCode": "CN",
	"vm": "nw",
	"countryNum": "156",
	"m": "pc",
	"page": 2,
	"n": 48,
	"q": "%E8%A3%A4%E5%AD%90",
	"qSource": "url",
	"pageSource": "",
	"tab": "all",
	"pageSize": "48",
	"totalPage": "100",
	"totalResults": "137306",
	"sourceS": "0",
	"sort": "_coefp",
	"bcoffset": "-13",
	"ntoffset": "13",
	"filterTag": "",
	"service": "",
	"prop": "",
	"loc": "",
	"start_price": None,
	"end_price": None,
	"startPrice": None,
	"endPrice": None,
	"categoryp": "",
	"ha3Kvpairs": None,
	"couponFilter": 0,
	"myCNA": "4PjnHzPgIA0CARsm5jekDfQ+"
}

在这里插入图片描述
json在线格式化
在这里插入图片描述
复制到python的函数的signParam字典中,将null值修改为None
接着继续完善getSign函数的MD5加密工作
import hashlib

 n = json.dumps(signParam)
    # print(json.dumps(json.dumps(signParam)))
    data = {
        "appId": "34385",
        "params": n
    }
    # print(data)
    n_data = json.dumps(data).replace(" ", "")
    eC= "1734492057250" # 时间戳
    str = em + "&" + eC + "&" + eS + "&" + json.dumps(data).replace(" ","")
    # print(str)
    MD5 = hashlib.md5()
    MD5.update(str.encode("utf-8"))
    sign = MD5.hexdigest()
    return sign,n_data

调用函数,获取签名sign。上面的时间戳我是写死的静态值,可以删除,改为动态的,
等一下在完整源码中会修改为动态的当前时间戳

import time  # required by time.time() below

# Current Unix time in milliseconds, as a string, passed to getSign as eC
date_time = str(int(time.time() * 1000))
sign, n = getSign(eC=date_time)
# print(sign)
# example output: f94586b665e0d865a20aa6d3acf708f3

有了sign,就可以发起请求,获取数据了,直接上完整源码
请求数据所在的api接口
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/

完整源码

# 可以运行版
# 获取淘宝数据:https://s.taobao.com/
# 搜索键盘相关数据,会自动拦截登录页面(所以需要cookie)
import csv
import time

import requests
from pprint import pprint
import hashlib
import json
import re
# mtop endpoint that the search page calls to fetch its item data
url = (
    "https://h5api.m.taobao.com/h5/"
    "mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
)

# Headers sent with the request; the cookie value is a placeholder that has
# to be replaced with your own cookie string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Referer": "https://s.taobao.com/",
    "cookie": "自己的cookie",
}

# The string below is a note, not executed logic: it records the
# FAIL_SYS_ILLEGAL_ACCESS reply and that the sign parameter changes on
# every request, which is why it has to be reproduced.
"""
 mtopjsonp6({"api":"mtop.relationrecommend.wirelessrecommend.recommend","data":{},"ret":["FAIL_SYS_ILLEGAL_ACCESS::非法请求"]
 sign参数每次请求都会变化,导致请求不到数据(参数sign逆向)
"""
# JS expression that produces the sign:
# eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)
def getSign(eC, token='db1e1adce046132af55f1e37728ca39b'):
    """Build the ``sign`` query value and the serialized ``data`` payload.

    Reproduces the page's JavaScript
    ``eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)``, where ``eE`` is
    MD5: the token, timestamp, app key and JSON payload are joined with
    ``&`` and hashed.

    Args:
        eC: Timestamp used for signing. The caller sends the same value as
            the ``t`` query parameter.
        token: Value of ``em.token`` captured in the browser. Defaults to
            the token recorded for this walkthrough.
            NOTE(review): presumably tied to the session cookie and
            short-lived; confirm and refresh it together with the cookie.

    Returns:
        tuple[str, str]: ``(sign, n_data)``. ``sign`` is the 32-character
        lowercase hex MD5 digest; ``n_data`` is the whitespace-free JSON
        text that was signed, to be sent as the ``data`` query parameter.
    """
    # Same value as the request's appKey query parameter.
    eS = '12574478'
    # The "params" field of ep.data, copied from the browser console
    # (JSON null rewritten as None).
    signParam = {
        "device": "HMA-AL00",
        "isBeta": "false",
        "grayHair": "false",
        "from": "nt_history",
        "brand": "HUAWEI",
        "info": "wifi",
        "index": "4",
        "rainbow": "",
        "schemaType": "auction",
        "elderHome": "false",
        "isEnterSrpSearch": "true",
        "newSearch": "false",
        "network": "wifi",
        "subtype": "",
        "hasPreposeFilter": "false",
        "prepositionVersion": "v2",
        "client_os": "Android",
        "gpsEnabled": "false",
        "searchDoorFrom": "srp",
        "debug_rerankNewOpenCard": "false",
        "homePageVersion": "v7",
        "searchElderHomeOpen": "false",
        "search_action": "initiative",
        "sugg": "_4_1",
        "sversion": "13.6",
        "style": "list",
        "ttid": "600000@taobao_pc_10.7.0",
        "needTabs": "true",
        "areaCode": "CN",
        "vm": "nw",
        "countryNum": "156",
        "m": "pc",
        "page": 1,
        "n": 48,
        "q": "%E8%A3%A4%E5%AD%90",
        "qSource": "url",
        "pageSource": "",
        "tab": "all",
        "pageSize": "48",
        "totalPage": "100",
        "totalResults": "5000",
        "sourceS": "48",
        "sort": "_coefp",
        "bcoffset": "-26",
        "ntoffset": "0",
        "filterTag": "",
        "service": "",
        "prop": "",
        "loc": "",
        "start_price": None,
        "end_price": None,
        "startPrice": None,
        "endPrice": None,
        "categoryp": "",
        "ha3Kvpairs": None,
        "couponFilter": 0,
        "myCNA": "4PjnHzPgIA0CARsm5jekDfQ+",
    }
    # Emit JSON with no whitespace between tokens. The previous
    # `.replace(" ", "")` produced the same text for this payload but would
    # also strip spaces that belong to a value.
    compact = (",", ":")
    n = json.dumps(signParam, separators=compact)
    # The inner params are embedded as a JSON *string* inside the outer
    # object; serialize once and reuse the exact text for signing and sending.
    n_data = json.dumps({"appId": "34385", "params": n}, separators=compact)
    sign_src = f"{token}&{eC}&{eS}&{n_data}"
    sign = hashlib.md5(sign_src.encode("utf-8")).hexdigest()
    return sign, n_data

# Current Unix time in milliseconds; used both for signing and as the `t`
# query parameter.
date_time = str(int(time.time() * 1000))
sign, n = getSign(eC=date_time)
print(sign)
params = {
    'jsv': '2.7.4',
    'appKey': '12574478',
    't': date_time,
    'sign': sign,
    'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
    'v': '2.0',
    'timeout': '10000',
    'type': 'jsonp',
    'dataType': 'jsonp',
    'callback': 'mtopjsonp6',
    'data': n
}
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, params=params, headers=headers, timeout=10)
html = resp.text

# The body is JSONP: mtopjsonpN({...}). Capture everything between the first
# "(" and the LAST ")" so parentheses inside titles or shop names survive
# (deleting every ")" would corrupt them).
matched = re.search(r'mtopjsonp\d+\((.*)\)', html, re.S)
if matched is None:
    raise SystemExit(f"Unexpected (non-JSONP) response: {html[:200]!r}")
info = matched.group(1)
jsonData = json.loads(info)

# A rejected request (e.g. FAIL_SYS_ILLEGAL_ACCESS) comes back with an empty
# "data" object; report the server's "ret" instead of failing with a KeyError.
items = (jsonData.get('data') or {}).get('itemsArray')
if items is None:
    raise SystemExit(f"No itemsArray in response, ret={jsonData.get('ret')}")

# Write one CSV row per item
with open('taobao.csv', mode="w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row
    head = ['标题','图片链接','价格','地区','销量','店铺']
    writer.writerow(head)
    for item in items:
        dit = {
            # Strip the highlight markup the search wraps around matched words
            'title': item.get('title', '').replace('<span class=H>', '').replace('</span>',''),
            'img': item.get('pic_path', ''),
            'price': item.get('price', ''),
            'procity': item.get('procity', ''),
            'realSales': item.get('realSales', ''),
            'shopName': item.get('nick', ''),
        }
        writer.writerow(dit.values())
        print(dit)



在这里插入图片描述
在这里插入图片描述

注:需要获取其他数据

修改源码几个参数
url所在浏览器位置
在这里插入图片描述
改Referer和cookie
在这里插入图片描述

重写getSign函数的em值,eS值,signParam值
data中的appid也改
修改真正数据接口的参数:params
在这里插入图片描述
最后运行代码,即可获取数据
在这里插入图片描述