Basic version
Fetch the target page and print the raw front-end HTML without any further processing.
# pip3 install requests
import requests

# request the target URL
def crawler():
    response = requests.get("https://www.scrapingcourse.com/ecommerce/")
    response.raise_for_status()
    print(response.text)

# execute the crawler
crawler()
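raise_for_status() turns any 4xx/5xx response into a requests.HTTPError, so this basic version simply crashes on a bad status. A minimal sketch of catching the failure instead (the /does-not-exist path is only an illustrative bad URL, not part of the original code):

import requests

try:
    response = requests.get("https://www.scrapingcourse.com/ecommerce/does-not-exist")
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
    print(response.text)
except requests.RequestException as e:  # HTTPError is a subclass of RequestException
    print(f"Request failed: {e}")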
An endlessly growing crawler
Start from the first link and keep a record of every link already visited;
from the HTML fetched at each link, collect the a[href] links and add them to the list of links still to visit;
links that have already been crawled are skipped with continue.
# pip3 install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

target_url = "https://www.scrapingcourse.com/ecommerce/"
# initialize the list of discovered URLs
urls_to_visit = [target_url]
visited_urls = set()  # track visited URLs to avoid crawling a page twice

def crawler():
    while urls_to_visit:
        # get the next page to visit from the list
        current_url = urls_to_visit.pop(0)
        print(current_url)
        if current_url in visited_urls:
            continue
        # record this URL as visited
        visited_urls.add(current_url)
        try:
            response = requests.get(current_url, timeout=5)  # timeout so a hanging request cannot stall the crawl
            response.raise_for_status()  # raise for non-2xx responses
        except requests.RequestException as e:
            print(f"Request failed: {current_url}, error: {e}")
            continue
        # parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")
        # collect all the links
        link_elements = soup.select("a[href]")
        for link_element in link_elements:
            url = link_element["href"]
            if url.startswith("#"):
                continue  # ignore in-page anchor links
            # convert relative links to absolute URLs
            if not url.startswith("http"):
                absolute_url = requests.compat.urljoin(target_url, url)
            else:
                absolute_url = url
            # ensure the link belongs to the target domain and hasn't been queued yet
            if (
                absolute_url.startswith(target_url)
                and absolute_url not in urls_to_visit
            ):
                urls_to_visit.append(absolute_url)

# execute the crawler
crawler()
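The relative-to-absolute conversion relies on urljoin (requests.compat.urljoin is effectively urllib.parse.urljoin re-exported). A quick sketch of how it resolves a few example hrefs against the page URL:

from urllib.parse import urljoin

base = "https://www.scrapingcourse.com/ecommerce/"
# a relative href is resolved against the current page's URL
print(urljoin(base, "page/2/"))                # https://www.scrapingcourse.com/ecommerce/page/2/
# a root-relative href replaces the whole path, so it may fall outside the target prefix
print(urljoin(base, "/cart/"))                 # https://www.scrapingcourse.com/cart/
# an absolute href is returned unchanged
print(urljoin(base, "https://example.com/x"))  # https://example.com/x

Links that resolve outside the target_url prefix, such as the /cart/ case above, are then dropped by the startswith(target_url) check.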
The effect of endless growth
Some links fail to be crawled and print an error message instead of being processed.
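Because the queue keeps growing as long as new in-domain links are discovered, one way to keep a test run bounded is to cap how many pages get visited. A minimal sketch, reusing the urls_to_visit and visited_urls globals from above (the max_crawl parameter is an assumption, not part of the original code):

def crawler(max_crawl=20):
    crawl_count = 0
    while urls_to_visit and crawl_count < max_crawl:
        current_url = urls_to_visit.pop(0)
        if current_url in visited_urls:
            continue
        visited_urls.add(current_url)
        crawl_count += 1
        # ... fetch, parse, and queue new links exactly as in the version above ...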