Python Web Crawling & Security - urllib + Examples

Published: 2025-07-02

Contents

The four modules
The most basic request
Scraping a Baidu page
GET request with a concatenated parameter
GET request with multiple parameters
Success
POST request (data, timeout)
User-Agent
Custom User-Agent
Random User-Agent
handler
Custom opener
Custom IP proxy
Authentication
Authenticated login
Authorization header
Reading and writing cookies with cookiejar
URLError
URL structure
urlparse&urlunparse
urlsplit&urlunsplit
urljoin
parse_qs&parse_qsl
quote&unquote
The robots protocol


urllib:

Python's built-in HTTP request library.

The four modules:

request: the most basic HTTP request module

error: the exception handling module

parse: a utility module providing functions for working with URLs

robotparser: parses robots.txt files
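
As a quick orientation, here is a minimal sketch of where the four submodules live (these are the standard-library imports; nothing here is specific to any particular site):

#!/usr/bin/env python3

import urllib.request      # send HTTP requests: urlopen, Request, handlers and openers
import urllib.error        # URLError / HTTPError exception classes
import urllib.parse        # urlencode, quote, urlparse and other URL utilities
import urllib.robotparser  # RobotFileParser for robots.txt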

The most basic request:

Sent with urlopen().

GET request:

Parameters are appended directly to the URL.

There may be more than one parameter.

Multiple parameters: urllib.parse.urlencode(params)

Start from a base URL and a dict params containing the query parameters.

Use the urllib.parse.urlencode() function to encode the query parameters into a query string, then append it to the base URL.

POST request:

Pass the (optional) data argument.

Scraping a Baidu page

#!/usr/bin/env python3

import urllib.request

def load_baidu_data():
    url = 'http://www.baidu.com'
    # send the request and read the raw bytes of the response body
    response = urllib.request.urlopen(url)
    data = response.read()
    # decode bytes -> str and save the page locally
    str_data = data.decode('utf-8')
    with open("bd.html", "w", encoding='utf-8') as f:
        f.write(str_data)

load_baidu_data()
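
urlopen() returns an http.client.HTTPResponse object, so besides read() you can also inspect the status code and headers. A minimal sketch using only standard attributes:

#!/usr/bin/env python3

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.status)               # HTTP status code, e.g. 200
print(response.getheaders())         # all response headers as a list of tuples
print(response.getheader('Server'))  # a single header by name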

GET request with a concatenated parameter

Code: search Baidu for "柯南" (Conan).

#!/usr/bin/env python3

import urllib.request
import urllib.parse
import string
def load_baidu_data():
    url = 'http://www.baidu.com/s?wd='
    # url + search keyword
    name = "柯南"

    final_url = url + name
    # the URL contains Chinese characters, so it must be percent-encoded first
    encode_url = urllib.parse.quote(final_url, safe=string.printable)
    # print(encode_url)
    response = urllib.request.urlopen(encode_url)
    data = response.read()
    # convert the bytes we got back into a str
    str_data = data.decode('utf-8')
    # print(str_data)
    with open("baidu-kenan.html", "w", encoding="utf-8") as f:
        f.write(str_data)
load_baidu_data()

GET request with multiple parameters

Code: search Baidu for "柯南" and jump to page 9. As in the previous step, start by inspecting the URL.

#!/usr/bin/env python3

import urllib.request
import urllib.parse
import string
def load_baidu_data():
    url = 'http://www.baidu.com/s?'
    # build the query string from a dict of parameters
    params = {
        "wd": "柯南",
        "pn": "80"
    }
    query_str = urllib.parse.urlencode(params)
    final_url = url + query_str
    print(final_url)
    response = urllib.request.urlopen(final_url)
    data = response.read()
    # convert the bytes we got back into a str
    str_data = data.decode('utf-8')
    # print(str_data)
    with open("baidu-kenan-pn80.html", "w", encoding="utf-8") as f:
        f.write(str_data)
load_baidu_data()

Success: the results page is saved as baidu-kenan-pn80.html.

POST request (data, timeout):

#!/usr/bin/env python

import urllib.request
import urllib.parse
import urllib.error

# 1. define the URL (substitute a URL of your own that accepts POST requests)
url = 'http://www.baidu.com/post'

# build the form data to send
data = {
    'hello': 'world',
    'name': 'kenan'
}

# the data must be urlencoded and converted to bytes
data_encode = urllib.parse.urlencode(data).encode("utf-8")

# encode("utf-8"): str -> bytes
# decode("utf-8"): bytes -> str

try:
    response = urllib.request.urlopen(url=url, data=data_encode, timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("Connection timed out!")

User-Agent

Custom User-Agent

#!/usr/bin/env python

import urllib.request
import urllib.parse
import urllib.error

# 1. define the URL
url = 'http://www.baidu.com/post'

# 2. build a custom Request and add a User-Agent header
header = {
    "User-Agent":"Mozilla/5.0 (Linux; U; Android 11; zh-CN; 2112123AC Build/RKQ1.200826.002) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/100.0.4896.58 Quark/6.2.2.246 Mobile Safari/537.36"
}

req = urllib.request.Request(url=url,headers=header,method='POST')
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
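
To confirm which User-Agent was actually sent, you can point the same kind of Request at httpbin.org, which reports the header back (a quick check, assuming the service is reachable; the agent string below is just a placeholder):

#!/usr/bin/env python

import urllib.request

header = {"User-Agent": "my-custom-agent/1.0"}  # any value you want to verify
req = urllib.request.Request(url='http://httpbin.org/user-agent', headers=header)
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))  # {"user-agent": "my-custom-agent/1.0"}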

Random User-Agent

#!/usr/bin/env python

import urllib.request
import urllib.parse
import urllib.error
import random

def user_agent():
    url = 'http://www.baidu.com/post'
    # a pool of user-agent strings
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 12; ELS-AN00 Build/HUAWEIELS-AN00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/107.0.5304.141 Mobile Safari/537.36 XWEB/5075 MMWEBSDK/20230504 MMWEBID/9308 MicroMessenger/8.0.37.2380(0x2800253D) WeChat/arm64 Weixin NetType/5G Language/zh_CN ABI/arm64 MiniProgramEnv/android",
        "Mozilla/5.0 (iPhone; CPU iPhone OS............ile/15E148 MicroMessenger/8.0.34(0x18002234) NetType/4G Language/zh_CN",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; hu; rv:1.8.0.11) Gecko/20070312 Firefox/1.5.0.1120",
        "Mozilla/5.0 (Macintosh; Int............ecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67",
        "Mozilla/5.0 (X1............7.6) Gecko/20050318 Firefox/1.0.2",
        "Mozilla/5.0 (Windows; U; Win............o) Chrome/100.0.4896.58 Safari/537.36 UCBrowser/15.4.8.1238"
    ]
    # pick one user-agent at random for each request
    random_user_agent = random.choice(user_agent_list)
    header = {
        "User-Agent":random_user_agent
    }

    req = urllib.request.Request(url=url,headers=header,method='POST')
    response = urllib.request.urlopen(req)
    print(response.read().decode("utf-8"))

user_agent()    

handler:

HTTPDefaultErrorHandler handles HTTP response errors; the errors are raised as HTTPError exceptions.

HTTPRedirectHandler handles redirects.

HTTPCookieProcessor handles cookies.

ProxyHandler sets a proxy; by default no proxy is set.

HTTPPasswordMgr manages passwords; it maintains a table of usernames and passwords.

HTTPBasicAuthHandler manages authentication; if a link requires authentication when it is opened, this handler can take care of it. Handlers can also be combined, as sketched below.
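
build_opener() accepts any number of handlers, so several of them can be combined into a single opener. A minimal sketch (the empty proxy dict and the target URL are only placeholders):

#!/usr/bin/env python

import urllib.request
import http.cookiejar

# combine a cookie handler and a proxy handler into one opener
cookie = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
proxy_handler = urllib.request.ProxyHandler({})  # empty dict = no proxy; fill in your own
opener = urllib.request.build_opener(cookie_handler, proxy_handler)

response = opener.open('http://www.baidu.com')
print(response.status)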

Custom opener

#!/usr/bin/env python

import urllib.request
import urllib.parse
import urllib.error

def handler_open():
    url = 'http://www.baidu.com/get'
    # build our own opener from an HTTPHandler
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    print(response.read().decode("utf-8"))
    
handler_open()    
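
If you want the custom opener to be used by every subsequent urlopen() call, instead of calling opener.open() explicitly, it can be installed globally. A minimal sketch:

#!/usr/bin/env python

import urllib.request

handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
# make this opener the default one used by urllib.request.urlopen()
urllib.request.install_opener(opener)

response = urllib.request.urlopen('http://www.baidu.com')
print(response.status)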

Custom IP proxy

#!/usr/bin/env python

import urllib.request
import urllib.parse
import urllib.error

def handler_open():
    try:
        url = 'http://httpbin.org/get'
        # add a proxy: the proxy's IP and port (substitute a working proxy of your own)
        proxy = {
            "http":"http://192.168.6.6:8888"
        }
        # create the proxy handler
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # build our own opener around it
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ",e)    
    
handler_open()    

Random IP proxy

#!/usr/bin/env python

import urllib.request
import urllib.parse
import urllib.error
import random

def proxy_ip():
    url = 'https://www.kuaidaili.com/testproxy'
    # a pool of proxy IPs
    ip_list = [
        "http://183.161.45.66:17114",
        "http://119.41.198.172:18350",
        "http://27.191.60.244:15982",
        "http://27.215.237.221:20983",
    ]
    # pick one at random each time
    proxy = random.choice(ip_list)
    print(proxy)
    try:
        # create the proxy handler
        proxy_handler = urllib.request.ProxyHandler({'http':proxy,'https':proxy})
        # build our own opener around it
        opener = urllib.request.build_opener(proxy_handler)
        response = opener.open(url)
        print(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        print("error: ",e)    

proxy_ip()  

Authentication:

HTTPBasicAuthHandler (handles basic HTTP authentication)

HTTPPasswordMgrWithDefaultRealm (usually used together with the auth handler)

    # create a password manager
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    # register the target url, username and password
    password_mgr.add_password(None, url, username, password)

The first argument is None, which means the default realm.

If you need to add credentials for a different realm, replace None with that realm name.
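
To make the realm argument concrete, here is a minimal sketch (the realm string, URL and credentials are made up for illustration only):

#!/usr/bin/env python

import urllib.request

# a plain HTTPPasswordMgr always needs an explicit realm
password_mgr = urllib.request.HTTPPasswordMgr()
password_mgr.add_password('Example Realm', 'https://example.com/', 'user', 'passwd')

# HTTPPasswordMgrWithDefaultRealm also accepts a concrete realm instead of None
password_mgr2 = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr2.add_password('Example Realm', 'https://example.com/', 'user', 'passwd')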

Demo site: https://ssr3.scrape.center

Authenticated login

#!/usr/bin/env python

import urllib.request

from urllib.parse import urlparse

def auth_login():
    url = 'https://ssr3.scrape.center/'
    # specify the username and password
    username = 'admin'
    password = 'admin'
    # create a password manager
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    # register the target url, username and password
    password_mgr.add_password(None, url, username, password)
    # create a basic-auth handler and hand it the password manager
    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    # build the opener
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    # response = urllib.request.urlopen(url)
    print(response.read().decode('utf-8'))

auth_login()

Authorization header

#!/usr/bin/env python

import urllib.request

from urllib.parse import urlparse

def auth_login():
    url = 'https://ssr3.scrape.center/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
        'Authorization':'Basic YWRtaW46YWRtaW4='  # base64 of "admin:admin"
    }
    req = urllib.request.Request(url=url,headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))
    
auth_login()    
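
The header value above is just the Base64 encoding of "username:password", so it can be built programmatically instead of hard-coding it. A minimal sketch:

#!/usr/bin/env python

import base64
import urllib.request

username, password = 'admin', 'admin'
credentials = base64.b64encode(f'{username}:{password}'.encode('utf-8')).decode('utf-8')

headers = {'Authorization': 'Basic ' + credentials}  # -> 'Basic YWRtaW46YWRtaW4='
req = urllib.request.Request(url='https://ssr3.scrape.center/', headers=headers)
print(urllib.request.urlopen(req).status)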
    

Reading and writing cookies with cookiejar:

Handlers for working with cookies.

Writing:

MozillaCookieJar:
saves cookies in the Mozilla-browser cookies.txt format

#!/usr/bin/env python

import urllib.request
import http.cookiejar

# cookie = http.cookiejar.CookieJar()
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
#
# for item in cookie:
#     print(item)

'''save the cookies printed above into a file'''
filename = 'ccc.txt'
cookie = http.cookiejar.MozillaCookieJar(filename=filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True,ignore_expires=True)

Reading:

LWPCookieJar
saves cookies in the libwww-perl (LWP) format.

Note that load() can only read a file saved in the matching format: ccc.txt above was written by MozillaCookieJar, so to load it with LWPCookieJar you would first need to save it with LWPCookieJar (see the round-trip sketch after the next block), or simply load it with MozillaCookieJar instead.

#!/usr/bin/env python

import urllib.request
import http.cookiejar


filename = 'ccc.txt'

# the file must be in LWP format for LWPCookieJar.load() to parse it
cookie = http.cookiejar.LWPCookieJar()
cookie.load(filename,ignore_discard=True,ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
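
A complete round trip in LWP format, so that the load() call has a matching file to read (a minimal sketch; 'lwp_cookie.txt' is just an illustrative filename):

#!/usr/bin/env python

import urllib.request
import http.cookiejar

filename = 'lwp_cookie.txt'

# save in LWP format
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# load the same file back
cookie2 = http.cookiejar.LWPCookieJar()
cookie2.load(filename, ignore_discard=True, ignore_expires=True)
for item in cookie2:
    print(item)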

Logging in to a site with cookies

#!/usr/bin/env python

import urllib.request
import http.cookiejar
import urllib.parse

# 1. log in to the site with a username and password

# login URL
url = 'https://www.yaozh.com/login'
# login parameters
login_data = {
    "type":"0",
    "username":"ppkke007",
    "pwd":"Iceropq13315",
    "pc_vcode":"",
    "country":"86_zh-CN",
    "mobile":"",
    "vcode":"",
    "pincode":"",
    "formhash":"CEA7846B38",
    "backurl":"https%253A%252F%252Fwww.yaozh.com%252F"
}
# send the login request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.54',
}
# urlencode the form data and convert it to bytes
login_str = urllib.parse.urlencode(login_data).encode('utf-8')

req = urllib.request.Request(url=url,headers=headers,data=login_str)
# if the login succeeds, the cookiejar stores the session cookies automatically
opener.open(req)

# 2. access the member page, carrying the cookies from the login
login_url = "https://www.yaozh.com/member/"
req2 = urllib.request.Request(login_url,headers=headers)
response = opener.open(req2)
# response = urllib.request.urlopen(login_url)
# print(response.read().decode('utf-8'))
data = response.read()
with open('cookie2.html',"wb") as f:
    f.write(data)

URLError

urllib's error module:
URLError inherits from OSError.

except error.URLError as e:
    print(e.reason)

This prints the reason for the error.

HTTPError:
a subclass of URLError used specifically for HTTP response errors; it carries code, reason and headers attributes.

#!/usr/bin/env python

import urllib.request
from urllib.error import *
import socket

# try:
#     url = 'https://www.baidu.com'
#     response = urllib.request.urlopen(url=url,timeout=0.01)
# except URLError as e:
#      print(e.reason)
#      if isinstance(e.reason,socket.timeout):
#          print("Time out!!")



try:
    url = 'https://ssr3.scrape.center/asdasd'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

# except URLError as e:
#     print(e.reason)

except HTTPError as e:
    # e.code, e.reason and e.headers give the status code, message and response headers
    print("http error:", e)

URL structure

urlparse
is a function in the standard-library module urllib.parse for parsing and manipulating URLs.

(This is the standard format of a URL:)
scheme://netloc/path;params?query#fragment

scheme (protocol): http or https
netloc (network location): the host
path
params (parameters attached to the path)
query (query parameters)
fragment: in-page anchor for navigation

urlparse&urlunparse

#!/usr/bin/env python

from urllib.parse import urlparse,urlunparse,urlsplit,urlunsplit,urljoin,parse_qs,parse_qsl,quote,unquote


url = 'http://www.baidu.com/index.html;user?id=0#comment'

result = urlparse(url=url)
# print(type(result),result)
print(result.scheme)
print(result.netloc)
print(result.path)
print(result.params)
print(result.query)
print(result.fragment)

result = urlparse(url=url,scheme='https',allow_fragments=False)
print(result.scheme)
print(result.fragment)

data = ['http','www.baidu.com','index.html','user','id=0','comment']
url = urlunparse(data)
print(url)

#http://www.baidu.com/index.html;user?id=0#comment

urlsplit&urlunsplit

Unlike urlparse, urlsplit does not split out the params component; the ;params part stays inside path, so the result has five fields instead of six.
The result is a named tuple, so its fields can also be accessed by index.

#!/usr/bin/env python

from urllib.parse import urlparse,urlunparse,urlsplit,urlunsplit,urljoin,parse_qs,parse_qsl,quote,unquote


# urlsplit
url = 'http://www.baidu.com/index.html;user?id=0#comment'
result = urlsplit(url)
print(result.scheme)
print(result[0])
print(result.netloc)
print(result.path)
print(result.query)
print(result.fragment)

# urlunsplit

data = ('http','www.baidu.com','index.html','id=0','comment')

print(urlunsplit(data))

urljoin

Resolves a relative URL against a base URL to produce an absolute URL.
base: the base URL, usually an absolute URL.
url: the relative URL.

#!/usr/bin/env python

from urllib.parse import urlparse,urlunparse,urlsplit,urlunsplit,urljoin,parse_qs,parse_qsl,quote,unquote


# urljoin
base_url = 'https://www.baidu.com'
relative_url = '/path/to/xxxx'
url = urljoin(base_url,relative_url)
print(url)

print(urljoin('https://www.baidu.com','/FAQ.html'))
print(urljoin('https://www.baidu.com','http://www.taobao.com'))
print(urljoin('https://www.baidu.com/admin.html','http://www.taobao.com'))
print(urljoin('https://www.baidu.com?wd=aaa','?user=1#comment'))
print(urljoin('https://www.baidu.com#comment','?user=1'))

Conclusion: base_url supplies three things: scheme, netloc and path. If any of these three is missing from the new link, it is filled in from base_url; if the new link already has it, the new link's own value is used. The params, query and fragment of base_url play no role.
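
To make the rule concrete, the calls above should produce results like the following (standard urljoin behaviour):

urljoin('https://www.baidu.com', '/FAQ.html')                        -> https://www.baidu.com/FAQ.html
urljoin('https://www.baidu.com', 'http://www.taobao.com')            -> http://www.taobao.com
urljoin('https://www.baidu.com/admin.html', 'http://www.taobao.com') -> http://www.taobao.com
urljoin('https://www.baidu.com?wd=aaa', '?user=1#comment')           -> https://www.baidu.com?user=1#comment
urljoin('https://www.baidu.com#comment', '?user=1')                  -> https://www.baidu.com?user=1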

parse_qs&parse_qsl

parse_qs

parses a GET query string back into a dict (each value is a list).

parse_qsl
parses the query string into a list of (name, value) tuples.

#!/usr/bin/env python

from urllib.parse import urlparse,urlunparse,urlsplit,urlunsplit,urljoin,parse_qs,parse_qsl,quote,unquote


# parse_qs
query = 'name=kenan&age=16'
print(parse_qs(query))   # {'name': ['kenan'], 'age': ['16']}

# parse_qsl
data = parse_qs(query)
print(parse_qsl(query))  # [('name', 'kenan'), ('age', '16')]

quote&unquote

quote
percent-encodes a string for use in a URL.

unquote
decodes a percent-encoded URL string.

#!/usr/bin/env python

from urllib.parse import urlparse,urlunparse,urlsplit,urlunsplit,urljoin,parse_qs,parse_qsl,quote,unquote


keyword = "柯南"
url = 'https://www.baidu.com/s?wd='+quote(keyword)
print(url)

#unquote
url_1 = unquote(url)
print(url_1)

The robots protocol

The crawler protocol:
it tells crawlers and search engines which pages may be crawled and which may not.
robots.txt (usually located in the site root).

The robotparser module
is used to parse robots.txt.

#!/usr/bin/env python

from urllib.robotparser import RobotFileParser

# create a RobotFileParser object for parsing robots.txt
robot_parser = RobotFileParser()
robot_parser.set_url('https://www.zhihu.com/robots.txt')

# fetch and parse robots.txt
robot_parser.read()

# check whether a specific URL may be crawled
user_agent = "BaiduSpider"
check_url = 'https://www.zhihu.com/'

# can_fetch
if robot_parser.can_fetch(user_agent,check_url):
    print("this url may be crawled")
else:
    print("this url may not be crawled")

