1。你是否还在为大模型的 key 而感到忧伤和囊中羞涩?openrouter.ai 目前可免费使用多个大模型,代码如下
from openai import OpenAI
# Minimal OpenRouter demo: ask a free multimodal model to describe a remote image.
llm = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="",
)

reply = llm.chat.completions.create(
    extra_headers={
        # Both headers are optional; they only affect rankings on openrouter.ai.
        "HTTP-Referer": "<YOUR_SITE_URL>",
        "X-Title": "<YOUR_SITE_NAME>",
    },
    extra_body={},
    model="google/gemini-2.5-pro-exp-03-25:free",
    messages=[
        {
            "role": "user",
            # A single user turn mixing a text question with an image URL part.
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                    },
                },
            ],
        }
    ],
)
print(reply.choices[0].message.content)
2。然后是爬虫代码(下面这段代码就是实现的官网的功能Playground - Firecrawl)
# Install with pip install firecrawl-py(https://docs.firecrawl.dev/api-reference/endpoint/crawl-post)官网登录获得key
from firecrawl import FirecrawlApp
# Scrape one page through the hosted Firecrawl API and dump the raw response.
crawler = FirecrawlApp(api_key='')
result = crawler.scrape_url(
    url='https://docs.cline.bot/',
    params={'formats': ['markdown']},
)
print(result)
3。还有一个用上面免费的大模型来进行对网页的分析
import os
from firecrawl import FirecrawlApp
import json
import re
import requests
from requests.exceptions import RequestException
from dotenv import load_dotenv
from openai import OpenAI
# Load environment variables from a local .env file
# (expects FIRECRAWL_API_KEY and OPENROUTER_API_KEY).
load_dotenv()
# os.getenv returns None when a key is missing; the clients below are still
# constructed and requests will only fail at call time.
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
# Initialize the FirecrawlApp and the OpenAI SDK client pointed at the
# OpenRouter-compatible endpoint.
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=openrouter_api_key,
)
# Free-tier model identifier on OpenRouter, used by query_openai below.
model_name = "google/gemini-2.5-pro-exp-03-25:free"
# ANSI color codes
class Colors:
    """ANSI escape sequences for colored terminal output."""
    RESET = '\033[0m'    # restore default styling
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    MAGENTA = '\033[95m'
    CYAN = '\033[96m'
def extract_urls_from_markdown(markdown_text):
    """Return the unique http(s) URLs found in *markdown_text*.

    URLs are matched up to the first whitespace, single/double quote, or
    closing parenthesis. Duplicates are removed while preserving first-seen
    order — the original ``list(set(...))`` made the result order
    nondeterministic across runs (string hash randomization).
    """
    pattern = r'(https?://[^\s\'")]+)'
    found = re.findall(pattern, markdown_text)
    # dict.fromkeys dedupes while keeping insertion order (dicts are ordered
    # since Python 3.7).
    return list(dict.fromkeys(found))
def detect_mime_type(url, timeout=8):
    """Best-effort classification of *url* as 'pdf', 'image', or None.

    Sends a HEAD request (following redirects) and inspects Content-Type;
    an 'image' verdict additionally requires the URL to end in a known
    raster-image extension. Returns None on request failure.
    """
    image_exts = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.heic', '.heif')
    try:
        head = requests.head(url, timeout=timeout, allow_redirects=True)
    except RequestException as e:
        print(f"Warning: HEAD request failed for {url}. Error: {e}")
        return None
    content_type = head.headers.get('Content-Type', '').lower()
    if 'pdf' in content_type:
        return 'pdf'
    if content_type.startswith('image/') and url.lower().endswith(image_exts):
        return 'image'
    return None
def query_openai(content):
    """Send a single-turn user message to the configured OpenRouter model
    and return the stripped text of the first completion choice."""
    ranking_headers = {
        # Optional metadata used by openrouter.ai rankings only.
        "HTTP-Referer": "https://your-site.com",
        "X-Title": "YourSiteName",
    }
    response = client.chat.completions.create(
        extra_headers=ranking_headers,
        model=model_name,
        messages=[{"role": "user", "content": content}],
    )
    print("response", response)
    return response.choices[0].message.content.strip()
def extract_json_from_response(text):
    """Parse a JSON payload out of an LLM reply.

    Accepts replies wrapped in a ```json fenced code block, a generic ```
    fenced block (fallback for models that omit the language tag), or raw
    JSON text. Raises json.JSONDecodeError if the extracted text is not
    valid JSON.
    """
    # Prefer an explicit ```json fence; fall back to any fenced block.
    match = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if not match:
        match = re.search(r"```(.*?)```", text, re.DOTALL)
    json_str = match.group(1).strip() if match else text.strip()
    print("=== Extracted JSON Content ===")
    print(json_str)
    # Fix: the original had a second, unreachable `return json.loads(json_str)`.
    return json.loads(json_str)
def find_relevant_page_via_map(objective, url, app):
    """Pick the top 3 URLs on *url* most relevant to *objective*.

    Asks the LLM for search keywords, maps the site with Firecrawl using
    those keywords, then has the LLM rank the discovered links as JSON.
    """
    print(f"{Colors.CYAN}Objective: {objective}{Colors.RESET}")
    print(f"{Colors.CYAN}Searching website: {url}{Colors.RESET}")
    keywords = query_openai(f"Provide 1-2 search keywords for: {objective}")
    site_map = app.map_url(url, params={"search": keywords})
    # Different Firecrawl versions return the link list under different keys.
    candidate_links = site_map.get('urls', []) or site_map.get('links', [])
    rank_prompt = f"""Rank these URLs for relevance to '{objective}'. Respond only with JSON:
{json.dumps(candidate_links, indent=2)}"""
    ranked = extract_json_from_response(query_openai(rank_prompt))
    return [entry["url"] for entry in ranked[:3]]
def main():
    """Prompt for a website and an objective, then print the most relevant links."""
    url = input(f"{Colors.BLUE}Enter the website: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
    links = find_relevant_page_via_map(objective, url, app)
    if not links:
        print(f"{Colors.RED}No relevant links found.{Colors.RESET}")
        return
    print(f"{Colors.GREEN}Relevant links found:{Colors.RESET}")
    for link in links:
        print(link)

if __name__ == "__main__":
    main()
4。你以为这就完了吗,不,上面的内容没有一点挑战性,有挑战的是我们本地部署源代码,来替代调用爬虫官网获取的api的作用(https://github.com/mendableai/firecrawl.git)
1)安装 node 20.17.0 版本
我是用nvm直接安装的
nvm install 20.17.0
nvm use 20.17.0
2)安装 pnpm 9.12.2
npm install -g pnpm@9.12.2
3)安装redis
sudo apt-get install lsb-release curl gpg
curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg
sudo chmod 644 /usr/share/keyrings/redis-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/redis.list
sudo apt-get update
sudo apt-get install redis
sudo systemctl enable redis-server
sudo systemctl start redis-server
4)配置.env
cd apps/api/
vim .env
我的.env内容如下:
# ===== Required ENVS =====
NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://localhost:6379
REDIS_RATE_LIMIT_URL=redis://localhost:6379
USE_DB_AUTHENTICATION=false
# ===== Optional ENVS =====
# Supabase 不使用,留空即可
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
# 其他可选,根据你是否用到再填
TEST_API_KEY=
SCRAPING_BEE_API_KEY=
OPENAI_API_KEY=
BULL_AUTH_KEY=
PLAYWRIGHT_MICROSERVICE_URL=
LLAMAPARSE_API_KEY=
SLACK_WEBHOOK_URL=
POSTHOG_API_KEY=
POSTHOG_HOST=
5)安装依赖项
# cd apps/api # to make sure you're in the right folder
pnpm install # make sure you have pnpm version 9+!
6)启动服务,第一个服务
redis,刚才已经启动过了
第二个服务:
cd apps/api/ 目录并运行
pnpm run workers
第三个服务
导航到 apps/api/ 目录并运行
pnpm run start
7)测试
curl -X GET http://localhost:3002/test
会出现hello world
8)测试抓取端点
curl -X POST http://localhost:3002/v1/crawl \
-H 'Content-Type: application/json' \
-d '{
"url": "https://mendable.ai"
}'
返回包含任务 id 的 JSON 结果(后续用该 id 查询抓取进度)
9)加入ai功能实现官网的能力
第一步,我们先抓取
curl -X POST http://localhost:3002/v1/crawl -H 'Content-Type: application/json' -d '{
"url": "https://docs.cline.bot/"
}'
然后得到结果,将结果保存到txt中
curl http://localhost:3002/v1/crawl/d40ed298-c2e5-4c0a-99f8-19f17f3f3f7c > out.txt
然后将其中的markdown信息保存起来
用下面的脚本
import json
# ===== 配置 =====
INPUT_JSON_FILE = "out.txt"              # JSON response saved from the firecrawl API
OUTPUT_MARKDOWN_FILE = "cline_docs.md"   # combined markdown output file

def extract_and_save():
    """Read the crawl-result JSON and concatenate every page's markdown
    into one file, with a numbered separator before each page."""
    with open(INPUT_JSON_FILE, "r", encoding="utf-8") as fp:
        payload = json.load(fp)
    if not payload.get("success"):
        print("❌ JSON 非正常,可能爬虫失败")
        return
    pages = payload.get("data", [])
    parts = []
    for idx, page in enumerate(pages):
        # Numbered horizontal-rule separator before each page.
        parts.append(f"\n\n---\n\n# Page {idx+1}\n\n")
        parts.append(page.get("markdown", ""))
    with open(OUTPUT_MARKDOWN_FILE, "w", encoding="utf-8") as fp:
        fp.write("".join(parts))
    print(f"✅ 成功提取 {len(pages)} 个 markdown 页面")
    print(f"✅ 已保存到 {OUTPUT_MARKDOWN_FILE}")

if __name__ == "__main__":
    extract_and_save()
得到一个md文件
再然后通过大模型分析md文件
import os
import json
from openai import OpenAI
# ==== 配置 ====
# OpenAI SDK client pointed at the OpenRouter-compatible endpoint.
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key="sk-or"  # NOTE(review): hardcoded/truncated API key — load from an env var instead
)
# Input markdown produced by the extraction script above.
MD_FILE = "cline_docs.md"
CHUNK_SIZE = 4000 # rough character-based token estimate; adjustable
# Per-chunk summaries are written here; created at import time.
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# ==== 工具函数 ====
def split_markdown(md_text, chunk_size=4000):
    """Split markdown into chunks of at most roughly *chunk_size* characters.

    Paragraphs (separated by blank lines) are greedily packed into chunks;
    a single paragraph longer than *chunk_size* still becomes its own chunk.
    Chunks are stripped of surrounding whitespace.

    Fix: the original appended a leading empty chunk when the very first
    paragraph already exceeded *chunk_size* (it flushed the empty
    accumulator unconditionally), wasting an LLM call downstream.
    """
    chunks = []
    current_chunk = ""
    for p in md_text.split("\n\n"):
        if len(current_chunk) + len(p) < chunk_size:
            current_chunk += p + "\n\n"
        else:
            # Flush only non-empty accumulations (fix for the empty-chunk bug).
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = p + "\n\n"
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# ==== LLM 处理 ====
def ask_llm(text, index):
    """Summarize one markdown chunk via the LLM and write the result to
    outputs/chunk_<index+1>.txt."""
    reply = client.chat.completions.create(
        extra_headers={
            # Optional metadata for openrouter.ai rankings.
            "HTTP-Referer": "http://localhost",
            "X-Title": "firecrawl-agent",
        },
        model="google/gemini-2.5-pro-exp-03-25:free",
        messages=[
            {"role": "user", "content": f"请总结以下文档内容:\n\n{text}"}
        ],
    )
    summary = reply.choices[0].message.content.strip()
    out_path = f"{OUTPUT_DIR}/chunk_{index+1}.txt"
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(summary)
    print(f"✅ chunk_{index+1} 已完成")
# ==== 主流程 ====
if __name__ == "__main__":
    # Read the combined markdown, chunk it, and summarize chunk by chunk.
    with open(MD_FILE, "r", encoding="utf-8") as fh:
        md_text = fh.read()
    pieces = split_markdown(md_text, CHUNK_SIZE)
    print(f"共分成 {len(pieces)} 个 chunk")
    for idx, piece in enumerate(pieces):
        ask_llm(piece, idx)
    print("\n✅ 全部分析完成,已保存到 outputs/ 目录")
最后得到outputs目录(要注意免费api的速率限制)