1. Scraping pollution monitoring data (including an inner frame)
This page has anti-debugging measures. In short: right-click → Inspect will not open the console. If you have read my earlier posts, you know F12 or Ctrl+Shift+I gets around that; the page then pauses on a debugger statement, and readers of my earlier posts will also know the fixes. The simplest, most brute-force option is Deactivate all breakpoints; method two is the override-and-replace approach (stub out the offending function and substitute it).
Method two, defeating the anti-debugging:
Locate the debugger statement:
After stubbing out and replacing the function, the anti-debugging pause no longer triggers.
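The same override-and-replace idea can also be driven from Selenium itself. Below is a minimal sketch, assuming the page fires its debugger pauses from a setInterval loop (a common pattern); if the site uses a different trigger, this hook will not catch it. driver here is the webdriver.Chrome() instance created in the script that follows.

anti_debug_js = """
const _setInterval = window.setInterval;
window.setInterval = function (fn, delay) {
    // Drop any timer whose callback contains a debugger statement
    if (String(fn).includes('debugger')) { return 0; }
    return _setInterval(fn, delay);
};
"""
# Install the hook before any page script runs (Chrome DevTools Protocol)
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': anti_debug_js})

With the pause out of the way, the full scraping script: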
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# Initialize the Chrome driver
driver = webdriver.Chrome()
# Open the target site
driver.get('https://szzdjc.cnemc.cn:8070/GJZ/Business/Publish/Main.html')
# Wait for the page to load
time.sleep(5)
# Switch into the inner iframe
driver.switch_to.frame(driver.find_element(By.XPATH, '//*[@id="MF"]'))
# Click the button to load the dynamic content
area_button = driver.find_element(By.XPATH, '//*[@id="ddm_Area"]/span')
area_button.click()
# Wait for it to load
time.sleep(2)
# Click the dynamically loaded link
link = driver.find_element(By.XPATH, '//*[@id="head_filter"]/div[1]/div/ul/li[1]/a')
link.click()
# Wait for the data to load (this site is slow)
time.sleep(45)
# Scroll to the bottom of the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait for loading to finish
time.sleep(2)
# Grab the city names and monitoring times
cities = driver.find_elements(By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[1]/span')
detection_times = driver.find_elements(By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[4]')
# Print the results
for city, detection_time in zip(cities, detection_times):
    print(f"City: {city.text}, Monitoring time: {detection_time.text}")
# Switch back to the main document
driver.switch_to.default_content()
# Close the driver
driver.quit()
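The fixed sleeps above, especially the 45-second one, are brittle: too short and the rows are not there yet, too long and the run wastes time. The same wait expressed with Selenium's explicit-wait API (the XPath is the row container used above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until at least one data row is present, up to 60 seconds
WebDriverWait(driver, 60).until(
    EC.presence_of_all_elements_located((By.XPATH, '//*[@id="gridDatas"]/li'))
)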
2. Proxy pool (89ip), Selenium approach
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

success_count = 0  # renamed from `sum` to avoid shadowing the builtin
proxy_arr = []
driver = webdriver.Chrome()
for num in range(1, 3):
    url = f'https://www.89ip.cn/index_{num}.html'
    driver.get(url)
    time.sleep(3)
    # First table column is the IP, second is the port
    proxy_1 = driver.find_elements(By.XPATH, '//tbody/tr/td[1]')
    proxy_2 = driver.find_elements(By.XPATH, '//tbody/tr/td[2]')
    for proxy_11, proxy_22 in zip(proxy_1, proxy_2):
        proxy_temp = f"{proxy_11.text}:{proxy_22.text}"
        proxies = {"http": f"http://{proxy_temp}", "https": f"http://{proxy_temp}"}
        try:
            # httpbin.org/ip echoes the requesting IP, confirming the proxy works
            response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5, verify=False)
            if response.status_code == 200:
                success_count += 1
                proxy_arr.append(proxy_temp)
        except Exception:
            continue
proxy = random.choice(proxy_arr) if proxy_arr else "no working proxies"
print("Number of proxies that passed the check:", success_count)
print("Randomly selected proxy:", proxy)
driver.quit()
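To actually route the browser through one of the harvested proxies, Chrome accepts a --proxy-server switch. A minimal sketch, assuming proxy holds an "ip:port" string from the pool above:

options = webdriver.ChromeOptions()
options.add_argument(f'--proxy-server=http://{proxy}')  # proxy = "ip:port" from the pool
proxied_driver = webdriver.Chrome(options=options)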
3. Proxy pool (89ip), requests approach
import requests
import time
from lxml import etree
import random

success_count = 0  # renamed from `sum` to avoid shadowing the builtin
proxy_arr = []
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'cookie': 'Hm_lvt_f9e56acddd5155c92b9b5499ff966848=1730287104,1730429940; HMACCOUNT=F89A075820B0EAF1; Hm_lpvt_f9e56acddd5155c92b9b5499ff966848=1730429996; https_waf_cookie=e5758711-3596-4a395e556275bcab68c58983f0d9a2ba341d; https_ydclearance=d85affb1d0d3d6275303f71f-b44c-4b4e-a087-cd1232c7e338-1730437813'
}
for num in range(1, 3):
    url = f'https://www.89ip.cn/index_{num}.html'
    response = requests.get(url, headers=headers)
    time.sleep(3)
    html = etree.HTML(response.text)
    # First table column is the IP, second is the port
    proxy_ips = html.xpath('//tbody/tr/td[1]/text()')
    proxy_ports = html.xpath('//tbody/tr/td[2]/text()')
    for ip, port in zip(proxy_ips, proxy_ports):
        proxy_temp = f"{ip.strip()}:{port.strip()}"
        proxies = {"http": f"http://{proxy_temp}", "https": f"http://{proxy_temp}"}
        print(proxies)
        try:
            # httpbin.org/ip echoes the requesting IP, confirming the proxy works
            test_response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
            if test_response.status_code == 200:
                success_count += 1
                proxy_arr.append(proxy_temp)
        except Exception:
            continue
# Output the results
proxy = random.choice(proxy_arr) if proxy_arr else "no working proxies"
print("Number of proxies that passed the check:", success_count)
print("Randomly selected proxy:", proxy)
4. Proxy pool (Kuaidaili)
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

success_count = 0  # renamed from `sum` to avoid shadowing the builtin
proxy_arr = []
driver = webdriver.Chrome()
for num in range(1, 3):
    url = f'https://www.kuaidaili.com/free/inha/{num}/'
    driver.get(url)
    time.sleep(3)
    # Column 4 holds the scheme (HTTP/HTTPS); columns 1 and 2 the IP and port
    heads = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[4]')
    proxy_1 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[1]')
    proxy_2 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[2]')
    for head, proxy_11, proxy_22 in zip(heads, proxy_1, proxy_2):
        head1 = head.text.lower()
        proxy_temp = f"{head1}://{proxy_11.text}:{proxy_22.text}"
        print(proxy_temp)
        try:
            # The proxies-dict key must be lowercase ("http"/"https") to match
            # the request URL's scheme; head.text is uppercase on this site
            proxies = {head1: proxy_temp}
            # httpbin.org is a service for testing HTTP requests; its /ip path
            # returns your IP, confirming whether the request went through the proxy
            response = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5, verify=False)
            if response.status_code == 200:
                success_count += 1
                proxy_arr.append(proxy_temp)
        except Exception:
            continue
proxy = random.choice(proxy_arr) if proxy_arr else "no working proxies"
print("Number of proxies that passed the check:", success_count)
print("Randomly selected proxy:", proxy)
driver.quit()
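One caveat: requests only consults the proxies-dict key whose name matches the request URL's scheme, so an "https" entry is never exercised by the http:// test URL above and the request silently goes out direct, producing a false positive. A small sketch that matches the test endpoint to the scheme:

# Pick the endpoint whose scheme matches the proxy entry being tested
test_url = "https://httpbin.org/ip" if head1 == "https" else "http://httpbin.org/ip"
response = requests.get(test_url, proxies={head1: proxy_temp}, timeout=5)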
5. Scraping multiple pages of Sogou search results
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://weixin.sogou.com/pcindex/')
# Type the search keyword ('爬虫', i.e. "crawler") and submit
search_box = driver.find_element(By.XPATH, '//*[@id="query"]')
search_box.send_keys('爬虫')
time.sleep(2)
search_button = driver.find_element(By.XPATH, '//*[@id="searchForm"]/div/span[2]/input')
search_button.click()
time.sleep(5)
# Scrape 10 pages in a loop
for page in range(10):
    print(f"Scraping page {page + 1}...")
    # Each result's element id ends in its index on the page (0-9)
    for num in range(10):
        try:
            title = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_title_{num}"]').text
            name = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_box_{num}"]/div[2]/div/span[1]').text
            print(f"Title: {title}, Source: {name}")
        except Exception as e:
            print(f"Error extracting item {num + 1}: {e}")
            continue
    try:
        next_button = driver.find_element(By.XPATH, '//*[@id="sogou_next"]')
        next_button.click()
        time.sleep(5)
    except Exception as e:
        print("No next page found, or an error occurred:", e)
        break
driver.quit()
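To keep the results instead of only printing them, the standard csv module is enough. A minimal sketch, assuming results is a list of (title, name) tuples collected in the loop above (a hypothetical name, not in the script):

import csv

# results: hypothetical list of (title, name) tuples gathered in the loop above
with open('sogou_results.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'Source'])
    writer.writerows(results)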