n8n部署
见上一篇内容
部署
https://blog.csdn.net/weixin_39348931/article/details/148586843?spm=1001.2014.3001.5501
创建工作流
复制以下内容
{
"nodes": [
{
"parameters": {
"options": {}
},
"type": "@n8n/n8n-nodes-langchain.chatTrigger",
"typeVersion": 1.1,
"position": [
0,
0
],
"id": "f6a44802-0a91-47f5-b7fb-a9411aaf8be0",
"name": "When chat message received",
"webhookId": "ae262f47-33cc-4204-a83f-e87a3c4adbcf",
"notesInFlow": true,
"notes": "请输入你想智能浏览的网址"
},
{
"parameters": {
"jsCode": "// 从输入中安全提取第一个URL\ntry {\n const input = $input.first();\n if (!input || !input.json || !input.json.chatInput) {\n throw new Error(\"Invalid input format\");\n }\n \n const message = input.json.chatInput.trim();\n if (!message) {\n throw new Error(\"Empty message received\");\n }\n \n // 支持各种URL格式\n // '.' and '~' are allowed inside the path/query so dotted URLs are not truncated\n const regex = /https?:\\/\\/[\\w-]+(\\.[\\w-]+)+(:\\d+)?([\\w/#?&%@+=.~-]*[\\w/#?&%@+=])?/g;\n const urls = message.match(regex);\n \n if (!urls || urls.length === 0) {\n throw new Error(\"No URL found in message\");\n }\n \n // 获取最佳匹配的URL\n // Sort a copy so allFoundUrls keeps the order found in the message\n const bestMatch = [...urls].sort((a, b) => b.length - a.length)[0];\n let finalUrl = bestMatch;\n \n // 清理常见的结尾标点\n const punctuations = ['.', ',', '!', '?', ')', ']', '}', ';', ':', '|', '\"', \"'\", '...'];\n while (punctuations.includes(finalUrl.slice(-1))) {\n finalUrl = finalUrl.slice(0, -1);\n }\n \n return {\n json: {\n extractedUrl: finalUrl,\n allFoundUrls: urls,\n originalMessage: message,\n success: true\n }\n };\n \n} catch (error) {\n // Guard the lookup: the input item itself may be missing\n const fallback = $input.first();\n return {\n json: {\n error: error.message,\n userInput: (fallback && fallback.json && fallback.json.chatInput) || \"N/A\",\n success: false\n }\n };\n}"
},
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [
220,
0
],
"id": "969af117-f012-465f-bcb7-a6a716c83b88",
"name": "Code"
},
{
"parameters": {
"url": "=http://10.253.4.55:8000/crawler",
"sendQuery": true,
"queryParameters": {
"parameters": [
{
"name": "url",
"value": "={{ $json.extractedUrl }}"
}
]
},
"options": {}
},
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
440,
0
],
"id": "6b5b8715-c562-4383-9ef5-42b627f33fa0",
"name": "HTTP Request"
}
],
"connections": {
"When chat message received": {
"main": [
[
{
"node": "Code",
"type": "main",
"index": 0
}
]
]
},
"Code": {
"main": [
[
{
"node": "HTTP Request",
"type": "main",
"index": 0
}
]
]
}
},
"pinData": {},
"meta": {
"instanceId": "29034d9b8fafb4702d23fb5e38d157bfcc3104f9c0e0c682010b18fcca669c87"
}
}
粘贴到工作流面板
使用 Ctrl+C 复制、Ctrl+V 粘贴
看到如下工作流
启动后端爬虫服务
import json
import subprocess
from test_cra import main
from fastapi import FastAPI, BackgroundTasks
import uvicorn
from fastapi.middleware.cors import CORSMiddleware
import pathlib
import os
import time
import config
# Create the FastAPI application instance.
app = FastAPI()

# Step 2: declare the list of allowed origins. It must include the origin of
# any cross-origin client; "*" is the wildcard entry.
origins = ["*"]

# Step 3: configure CORSMiddleware.
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive — confirm this is intended outside of local testing.
_cors_settings = {
    "allow_origins": origins,  # origins allowed to access the API
    "allow_credentials": True,  # support cookies
    "allow_methods": ["*"],  # allowed request methods
    "allow_headers": ["*"],  # allowed request headers
}
app.add_middleware(CORSMiddleware, **_cors_settings)
# Upper bound on crawler runs per request, so a crawler that never updates
# the output file cannot keep the request looping forever.
_MAX_CRAWL_ATTEMPTS = 3


# Example call: http://localhost:8000/crawler?url=xxx
@app.get("/crawler")
async def clawer(url: str):
    """Crawl ``url`` with the matching platform crawler and return its content.

    The platform is chosen from the scheme/host part of ``url``. The crawler is
    started as a child process (``main.py`` run by the project's venv
    interpreter) and its result is read back from
    ``data/<platform>/json/detail_contents_<YYYY-MM-DD>.json`` next to this
    file, looked up by ``url``.

    Args:
        url: Address of the page to crawl, passed as the ``url`` query
            parameter.

    Returns:
        ``{"message": <entry stored under url in the JSON output file>}``.

    Raises:
        HTTPException: 400 when no crawler matches the URL, 502 when the
            output file is still not updated after ``_MAX_CRAWL_ATTEMPTS``
            runs, 404 when the output file has no entry for ``url``.
    """
    import asyncio

    from fastapi import HTTPException

    # Decide which platform crawler to use from the base URL (scheme + host).
    base_url = "/".join(url.split("/")[0:3])
    print(base_url)
    if "zhihu.com" in base_url:
        platform = "zhihu"
        config.ZHIHU_SPECIFIED_ID_LIST = [url]
    else:
        # Without a known platform the crawler has nothing sensible to run.
        raise HTTPException(
            status_code=400, detail=f"Unsupported platform for url: {url}"
        )

    dir_path = pathlib.Path(__file__).parent.absolute()
    # Build the path of today's output file.
    date = time.strftime('%Y-%m-%d', time.localtime())
    file_path = dir_path / f"data/{platform}/json/detail_contents_{date}.json"
    if not file_path.exists():
        # Seed an empty JSON object so the file can be stat'ed and loaded.
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump({}, f, ensure_ascii=False, indent=4)

    # Reference time: the file counts as updated once its mtime passes this.
    now_time = time.time()
    os.chdir(dir_path)
    print(os.getcwd())

    # NOTE: the interpreter path follows the Windows venv layout (Scripts/).
    command = [
        os.path.join(dir_path, "venv/Scripts/python.exe"),
        "main.py",
        "--platform", platform,
        "--type", "detail",
        "--lt", "cookie",
        "--save_data_option", "json",
        "--zhihu_url", url,
    ]
    loop = asyncio.get_running_loop()
    for _ in range(_MAX_CRAWL_ATTEMPTS):
        # Run the blocking child process in a worker thread so the event loop
        # stays free to serve other requests.
        await loop.run_in_executor(None, subprocess.run, command)
        if file_path.stat().st_mtime > now_time:
            break
    else:
        raise HTTPException(
            status_code=502,
            detail=f"Crawler did not update {file_path.name} "
                   f"after {_MAX_CRAWL_ATTEMPTS} attempts",
        )

    with open(file_path, "r", encoding="utf-8") as f:
        file_data = json.load(f)
    print(file_data)
    if url not in file_data:
        raise HTTPException(
            status_code=404, detail=f"No crawled content found for url: {url}"
        )
    return {"message": file_data[url]}
if __name__ == "__main__":
    # Hand uvicorn the application as a string so that features such as
    # reload are supported.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )