爬虫日常实战

发布于:2024-11-03 ⋅ 阅读:(9) ⋅ 点赞:(0)

1.爬虫爬取污染监测数据(含内嵌框架 iframe)

此页面有两处反调试措施:其一,鼠标右键"检查"打不开控制台,可以改用 F12 或 Ctrl+Shift+I 打开(我之前的博客介绍过);其二,打开控制台后页面会被 debugger 语句断住。对于后者,方法一最简单、直接、暴力——在开发者工具中"停用所有断点";方法二是使用置空和替换的思想,把触发 debugger 的代码置空或替换掉。

方法二反调试:

查找debugger位置:

进行置空和替换之后就没有反调试暂停了。

 

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Scrape city names and detection times from the data grid that the target
# page renders inside its inner iframe.
#
# Start a Chrome session. Everything after this point runs inside
# try/finally so the browser is always shut down, even when a locator
# fails and raises; otherwise a Chrome/chromedriver process is left behind.
driver = webdriver.Chrome()

try:
    # Open the target site.
    driver.get('https://szzdjc.cnemc.cn:8070/GJZ/Business/Publish/Main.html')

    # Fixed wait for the outer page to load.
    time.sleep(5)

    # Switch into the inner iframe (id "MF") so the locators below are
    # resolved against the frame's document rather than the outer page.
    driver.switch_to.frame(driver.find_element(By.XPATH, '//*[@id="MF"]'))

    # Click the area control to load the dynamic filter content.
    area_button = driver.find_element(By.XPATH, '//*[@id="ddm_Area"]/span')
    area_button.click()

    # Fixed wait for the filter list to appear.
    time.sleep(2)

    # Click the first link of the dynamically loaded filter list.
    link = driver.find_element(
        By.XPATH, '//*[@id="head_filter"]/div[1]/div/ul/li[1]/a'
    )
    link.click()

    # Fixed (long) wait for the data grid to load.
    time.sleep(45)

    # Scroll to the bottom of the page.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Fixed wait after scrolling.
    time.sleep(2)

    # Collect the city-name spans (column 1) and detection-time cells
    # (column 4) of every grid row.
    cities = driver.find_elements(
        By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[1]/span'
    )
    detection_times = driver.find_elements(
        By.XPATH, '//*[@id="gridDatas"]/li/table/tbody/tr/td[4]'
    )

    # Print the results pairwise; zip stops at the shorter list.
    for city, detection_time in zip(cities, detection_times):
        print(f"城市名: {city.text}, 检测时间: {detection_time.text}")

    # Switch back to the top-level document.
    driver.switch_to.default_content()
finally:
    # Always close the browser and end the WebDriver session.
    driver.quit()

2.代理池(89代理),selenium方法

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

# Collect free proxies from the first two 89ip.cn listing pages with
# Selenium, keep the ones that can complete a test request, and pick one
# at random.
#
# `success_count` replaces the original counter named `sum`, which shadowed
# the builtin of the same name.
success_count = 0
proxy_arr = []
driver = webdriver.Chrome()

try:
    for num in range(1, 3):
        url = f'https://www.89ip.cn/index_{num}.html'
        driver.get(url)
        # Fixed wait for the listing table to render.
        time.sleep(3)
        # Column 1 is used as the IP and column 2 as the port of each row.
        ip_cells = driver.find_elements(By.XPATH, '//tbody/tr/td[1]')
        port_cells = driver.find_elements(By.XPATH, '//tbody/tr/td[2]')
        for ip_cell, port_cell in zip(ip_cells, port_cells):
            proxy_temp = f"{ip_cell.text}:{port_cell.text}"
            proxies = {
                "http": f"http://{proxy_temp}",
                "https": f"http://{proxy_temp}",
            }
            try:
                # Plain-HTTP probe, so no TLS verification option is needed.
                response = requests.get(
                    "http://httpbin.org/ip", proxies=proxies, timeout=5
                )
            except requests.RequestException:
                # Unreachable or misbehaving proxy: skip it. Catching only
                # requests' own errors (instead of a bare `except:`) keeps
                # Ctrl+C able to abort the scan.
                continue
            if response.status_code == 200:
                success_count += 1
                proxy_arr.append(proxy_temp)
finally:
    # Close the browser even if a page load or locator raises.
    driver.quit()

proxy = random.choice(proxy_arr) if proxy_arr else "没有可以访问的proxies"
print("一共成功可以访问的代理个数:", success_count)
print("随机选择的代理:", proxy)

3.代理池(89代理),requests方法

import requests
import time
from lxml import etree
import random

# Collect free proxies from the first two 89ip.cn listing pages using
# requests + lxml, keep the ones that can complete a test request, and pick
# one at random.
#
# `success_count` replaces the original counter named `sum`, which shadowed
# the builtin of the same name.
success_count = 0
proxy_arr = []
# Browser-like headers sent with the listing request.
# NOTE(review): the cookie looks like a captured session value and has
# presumably expired; confirm whether the site still accepts it.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'cookie':'Hm_lvt_f9e56acddd5155c92b9b5499ff966848=1730287104,1730429940; HMACCOUNT=F89A075820B0EAF1; Hm_lpvt_f9e56acddd5155c92b9b5499ff966848=1730429996; https_waf_cookie=e5758711-3596-4a395e556275bcab68c58983f0d9a2ba341d; https_ydclearance=d85affb1d0d3d6275303f71f-b44c-4b4e-a087-cd1232c7e338-1730437813'
}
for num in range(1, 3):
    url = f'https://www.89ip.cn/index_{num}.html'
    # Without a timeout, requests waits indefinitely on a stalled server.
    page_response = requests.get(url, headers=headers, timeout=10)
    # Pause between listing-page requests.
    time.sleep(3)
    html = etree.HTML(page_response.text)
    # Column 1 is used as the IP and column 2 as the port of each row.
    proxy_ips = html.xpath('//tbody/tr/td[1]/text()')
    proxy_ports = html.xpath('//tbody/tr/td[2]/text()')
    for ip, port in zip(proxy_ips, proxy_ports):
        proxy_temp = f"{ip.strip()}:{port.strip()}"
        proxies = {"http": f"http://{proxy_temp}", "https": f"http://{proxy_temp}"}
        print(proxies)
        try:
            # Separate name so the listing-page response is not overwritten.
            probe_response = requests.get(
                "http://httpbin.org/ip", proxies=proxies, timeout=5
            )
        except requests.RequestException:
            # Unreachable or misbehaving proxy: skip it.
            continue
        if probe_response.status_code == 200:
            success_count += 1
            proxy_arr.append(proxy_temp)

# Output the results.
proxy = random.choice(proxy_arr) if proxy_arr else "没有可以访问的proxies"
print("一共成功可以访问的代理个数:", success_count)
print("随机选择的代理:", proxy)

4.代理池(快代理)

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests

# Collect free proxies from the first two kuaidaili.com listing pages with
# Selenium, keep the ones that can complete a test request, and pick one
# at random.
#
# `success_count` replaces the original counter named `sum`, which shadowed
# the builtin of the same name.
success_count = 0
proxy_arr = []
driver = webdriver.Chrome()

try:
    for num in range(1, 3):
        url = f'https://www.kuaidaili.com/free/inha/{num}/'
        driver.get(url)
        # Fixed wait for the listing table to render.
        time.sleep(3)
        # Column 4 is used as the proxy scheme, column 1 as the IP and
        # column 2 as the port of each row.
        # NOTE(review): presumably column 4 holds "HTTP"/"HTTPS"; confirm
        # against the live page.
        heads = driver.find_elements(By.XPATH,'//*[@id="table__free-proxy"]/div/table/tbody/tr/td[4]')
        proxy_1 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[1]')
        proxy_2 = driver.find_elements(By.XPATH, '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[2]')
        for head, ip_cell, port_cell in zip(heads, proxy_1, proxy_2):
            scheme = head.text.lower()
            proxy_temp = f"{scheme}://{ip_cell.text}:{port_cell.text}"
            print(proxy_temp)
            # requests picks a proxy by the lower-case scheme of the target
            # URL. The original keyed the dict with the raw cell text and
            # always probed an http:// URL, so the proxy could be bypassed
            # and a direct request counted as success.
            proxies = {scheme: proxy_temp}
            try:
                # httpbin's /ip path returns the caller's IP address, which
                # helps confirm the request went through the proxy. The
                # probe URL uses the proxy's own scheme so the mapping
                # above applies. verify=False is kept from the original;
                # only reachability is checked here.
                response = requests.get(
                    f"{scheme}://httpbin.org/ip",
                    proxies=proxies,
                    timeout=5,
                    verify=False,
                )
            except requests.RequestException:
                # Unreachable proxy, or an unsupported/empty scheme: skip.
                # Catching only requests' own errors (instead of a bare
                # `except:`) keeps Ctrl+C able to abort the scan.
                continue
            if response.status_code == 200:
                success_count += 1
                proxy_arr.append(proxy_temp)
finally:
    # Close the browser even if a page load or locator raises.
    driver.quit()

proxy = random.choice(proxy_arr) if proxy_arr else "没有可以访问的proxies"
print("一共成功可以访问的代理个数:", success_count)
print("随机选择的代理:", proxy)

5.爬虫搜狗多页

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Search Sogou's WeChat portal for a keyword and print the title and the
# account-name span of each result across up to 10 result pages.
#
# Everything after the driver is created runs inside try/finally so the
# browser is always shut down, even when the search box or button cannot
# be located.
driver = webdriver.Chrome()

try:
    driver.get('https://weixin.sogou.com/pcindex/')
    search_box = driver.find_element(By.XPATH, '//*[@id="query"]')
    search_box.send_keys('爬虫')
    time.sleep(2)
    search_button = driver.find_element(
        By.XPATH, '//*[@id="searchForm"]/div/span[2]/input'
    )
    search_button.click()
    # Fixed wait for the first result page to load.
    time.sleep(5)

    # Crawl up to 10 pages.
    for page in range(10):
        print(f"正在爬取第 {page + 1} 页...")
        # Result elements are addressed by ids ending in _0 .. _9.
        for num in range(10):
            try:
                title = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_title_{num}"]').text
                name = driver.find_element(By.XPATH, f'//*[@id="sogou_vr_11002601_box_{num}"]/div[2]/div/span[1]').text
                print(f"标题: {title}, 名称: {name}")
            except Exception as e:
                # Best effort: report the failing item and move on.
                print(f"提取第 {num + 1} 条数据时发生错误: {e}")
                continue

        try:
            next_button = driver.find_element(By.XPATH, '//*[@id="sogou_next"]')
            next_button.click()
            # Fixed wait for the next result page to load.
            time.sleep(5)
        except Exception as e:
            # No "next" link (or the click failed): stop paging.
            print("没有找到下一页,或发生错误:", e)
            break
finally:
    # Always close the browser and end the WebDriver session.
    driver.quit()