【网站内容安全检测】之3:获取所有外部域名访问后图像

发布于:2025-06-27 ⋅ 阅读:(19) ⋅ 点赞:(0)

使用Go语言调用Chrome浏览器进行截图,对电脑的性能要求比较高,因此速度比较有限;但就目前来看,这种方式能够最好地获取网页加载完成后的结果。

main.go

package main

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/chromedp/chromedp"
)

// Task describes one screenshot job that is queued for a worker.
type Task struct {
	URL      string // address of the page to open and capture
	Filename string // path of the image file the screenshot is written to
}

// blacklist is the domain blacklist: readURLs skips every entry that matches
// one of these keywords.
var blacklist = []string{"edu.cn", "gov.cn"}

// Progress counters shared between main, the progress monitor and the workers.
var (
	totalTasks    int64 // total number of tasks; set in main before any worker starts
	finishedTasks int64 // number of tasks already processed; accessed via sync/atomic
)

// main parses the command-line flags, loads the URL list, distributes the
// screenshot tasks over a pool of workers and prints a progress line once per
// second until every worker has returned.
func main() {
	start := time.Now()
	defer func() {
		if r := recover(); r != nil {
			log.Printf("程序异常退出: %v", r)
		}
	}()

	urlFile := flag.String("urls", "urls.txt", "包含URL列表的文件路径")
	outputDir := flag.String("output", "screenshots", "截图保存的目录")
	workers := flag.Int("workers", 50, "并发工作线程数(建议1~3)")
	width := flag.Int("width", 1280, "浏览器窗口宽度")
	height := flag.Int("height", 800, "浏览器窗口高度")
	fullPage := flag.Bool("full", false, "是否截取整个页面")
	// The default equals the 60-second floor enforced below, so a run without
	// -timeout does not trigger the "too small" warning.
	timeout := flag.Int("timeout", 60, "每个任务的超时时间(秒,建议大于页面加载等待时间,默认60)")
	retry := flag.Int("retry", 3, "失败重试次数")
	initialWait := flag.Int("initialWait", 1, "初始等待时间(秒),用于分散任务启动")
	flag.Parse()

	// Clamp values that would otherwise make the run silently do nothing or
	// fail with "context canceled".
	if *timeout <= 30 {
		log.Printf("警告:timeout参数过小,已自动调整为60秒以避免context canceled错误!")
		*timeout = 60
	}
	if *workers < 1 {
		log.Printf("警告:workers参数必须大于0,已自动调整为1")
		*workers = 1
	}
	if *retry < 1 {
		log.Printf("警告:retry参数必须大于0,已自动调整为1")
		*retry = 1
	}

	// MkdirAll is a no-op when the directory already exists, so no Stat is needed.
	if err := os.MkdirAll(*outputDir, 0755); err != nil {
		log.Fatalf("创建输出目录失败: %v", err)
	}

	urls, err := readURLs(*urlFile)
	if err != nil {
		log.Fatalf("读取URL文件失败: %v", err)
	}
	if len(urls) == 0 {
		log.Fatal("URL列表为空")
	}

	atomic.StoreInt64(&totalTasks, int64(len(urls)))

	// The channel is buffered for the whole list, so it can be filled and
	// closed up front without blocking.
	taskCh := make(chan Task, len(urls))
	for _, rawURL := range urls {
		taskCh <- Task{URL: rawURL, Filename: generateFilename(rawURL, *outputDir)}
	}
	close(taskCh)

	// More workers than tasks would only sit idle.
	workerCount := *workers
	if workerCount > len(urls) {
		workerCount = len(urls)
	}

	// Progress monitor: stops by itself once every task is counted, or when
	// main signals that all workers have returned.
	stopMonitor := make(chan struct{})
	monitorDone := make(chan struct{})
	go func() {
		defer close(monitorDone)
		began := time.Now()
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			if printProgress(began) {
				fmt.Println()
				return
			}
			select {
			case <-ticker.C:
			case <-stopMonitor:
				printProgress(began)
				fmt.Println()
				return
			}
		}
	}()

	log.Printf("开始处理 %d 个URL,使用 %d 个工作线程\n", len(urls), workerCount)
	var wg sync.WaitGroup
	stagger := time.Duration(*initialWait) * time.Second
	for i := 0; i < workerCount; i++ {
		if i > 0 {
			// Fixed gap between consecutive workers. Sleeping i*initialWait here
			// would make the total start-up delay grow quadratically.
			time.Sleep(stagger)
			if len(taskCh) == 0 {
				// The queue was drained while waiting; no further worker is needed.
				break
			}
		}

		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			processTasks(workerID, taskCh, *width, *height, *fullPage, *timeout, *retry)
		}(i)
	}

	wg.Wait()
	// Let the monitor print its final line before the summary is logged.
	close(stopMonitor)
	<-monitorDone

	log.Printf("所有任务完成,耗时: %s\n", time.Since(start))
}

// printProgress rewrites the progress line on stdout and reports whether all
// tasks have been processed.
func printProgress(began time.Time) bool {
	done := atomic.LoadInt64(&finishedTasks)
	total := atomic.LoadInt64(&totalTasks)
	elapsed := time.Since(began).Seconds()

	var speed, remain, percent float64
	if elapsed > 0 {
		speed = float64(done) / elapsed
	}
	if speed > 0 {
		remain = float64(total-done) / speed
	}
	if total > 0 {
		percent = float64(done) / float64(total) * 100
	}
	fmt.Printf("\r进度: %d/%d (%.2f%%) | 速度: %.2f/秒 | 已用: %.0fs | 预计剩余: %.0fs",
		done, total, percent, speed, elapsed, remain)
	return done >= total
}

// readURLs loads the URL list from filePath, one entry per line.
//
// Blank lines are ignored, entries without an http:// or https:// prefix get
// "https://" prepended, and entries whose host matches the blacklist are
// dropped. It returns the remaining URLs in file order, or the error from
// reading the file.
func readURLs(filePath string) ([]string, error) {
	data, err := ioutil.ReadFile(filePath)
	if err != nil {
		return nil, err
	}

	// A UTF-8 byte order mark would otherwise end up inside the first URL.
	content := strings.TrimPrefix(string(data), "\uFEFF")

	var urls []string
	for _, line := range strings.Split(content, "\n") {
		// TrimSpace also removes the '\r' of CRLF line endings.
		rawURL := strings.TrimSpace(line)
		if rawURL == "" {
			continue
		}
		if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
			rawURL = "https://" + rawURL
		}
		if isBlacklisted(rawURL) {
			continue
		}
		urls = append(urls, rawURL)
	}

	return urls, nil
}

// isBlacklisted reports whether the host of rawURL contains a blacklist
// keyword. Only the host is inspected so that a keyword appearing in the path
// or query of an unrelated site does not exclude it; if the URL cannot be
// parsed, the whole string is checked instead.
func isBlacklisted(rawURL string) bool {
	target := rawURL
	if u, err := url.Parse(rawURL); err == nil && u.Host != "" {
		target = u.Host
	}
	// Host names are case-insensitive.
	target = strings.ToLower(target)
	for _, keyword := range blacklist {
		if strings.Contains(target, keyword) {
			return true
		}
	}
	return false
}

// generateFilename derives the screenshot path for urlStr inside outputDir.
//
// The name is built from the URL's host and path: path separators and other
// characters that are not allowed in file names on common file systems are
// replaced with '_', and the result is limited to 100 bytes without cutting a
// UTF-8 character in half. If the URL cannot be parsed, or yields an empty
// name, a timestamp-based "unknown_<nanoseconds>.png" is used instead.
func generateFilename(urlStr, outputDir string) string {
	const maxNameBytes = 100

	u, err := url.Parse(urlStr)
	if err != nil {
		return filepath.Join(outputDir, fmt.Sprintf("unknown_%d.png", time.Now().UnixNano()))
	}

	name := strings.Map(func(r rune) rune {
		switch r {
		case '/', '\\', ':', '*', '?', '"', '<', '>', '|':
			// ':' appears in host:port; the others can come from decoded paths.
			return '_'
		}
		if r < 0x20 {
			return '_'
		}
		return r
	}, u.Host+u.Path)

	if len(name) > maxNameBytes {
		cut := maxNameBytes
		// Step back over UTF-8 continuation bytes (10xxxxxx) so that the cut
		// lands on a character boundary.
		for cut > 0 && name[cut]&0xC0 == 0x80 {
			cut--
		}
		name = name[:cut]
	}

	if name == "" {
		name = fmt.Sprintf("unknown_%d", time.Now().UnixNano())
	}

	return filepath.Join(outputDir, name+".png")
}

// processTasks is the worker loop. It starts one Chrome instance, then takes
// tasks from taskCh until the channel is closed and drained, opening a new tab
// for every capture attempt.
//
// width and height set the browser window size, fullPage and timeout (seconds)
// are passed through to captureScreenshot, and retry is the maximum number of
// attempts per URL (values below 1 are treated as 1). Every task increments
// finishedTasks exactly once; URLs that could not be captured are appended to
// failed_urls.txt.
func processTasks(workerID int, taskCh <-chan Task, width, height int, fullPage bool, timeout, retry int) {
	if retry < 1 {
		// Guarantee at least one attempt; otherwise every task would be
		// reported as failed without ever being tried.
		retry = 1
	}

	// Chrome options, including settings meant to reduce automation detection.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.Flag("headless", false), // run with a visible browser window
		chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"),
		chromedp.Flag("disable-blink-features", "AutomationControlled"),
		chromedp.Flag("disable-web-security", true),           // disable the web security policy
		chromedp.Flag("allow-running-insecure-content", true), // allow insecure (mixed) content
		chromedp.Flag("ignore-certificate-errors", true),      // ignore TLS certificate errors
		chromedp.WindowSize(width, height),
		chromedp.Flag("no-sandbox", true),             // disable the sandbox; required in some environments
		chromedp.Flag("disable-setuid-sandbox", true), // disable the setuid sandbox
	)

	allocCtx, allocCancel := chromedp.NewExecAllocator(context.Background(), opts...)
	defer allocCancel()
	parentCtx, parentCancel := chromedp.NewContext(allocCtx)
	defer parentCancel()

	// chromedp allocates the browser on the first Run of a context. A child
	// context only becomes a tab of the parent's browser if the parent already
	// has one; otherwise each child launches (and later kills) its own Chrome.
	// Running the parent once makes this worker really share a single browser.
	if err := chromedp.Run(parentCtx); err != nil {
		// Not fatal: the per-task contexts below will try to start a browser
		// themselves, and failures are recorded per URL.
		log.Printf("工作线程 %d 预启动Chrome失败,将在处理任务时重新尝试启动: %v", workerID, err)
	}

	for task := range taskCh {
		success := false
		exhausted := true // false when retrying was abandoned early

		for attempt := 1; attempt <= retry; attempt++ {
			log.Printf("工作线程 %d 正在处理 %s (尝试 %d/%d)\n", workerID, task.URL, attempt, retry)

			if attempt > 1 {
				// Linear back-off before a retry.
				time.Sleep(time.Duration(attempt*2) * time.Second)
			}

			// One fresh tab per attempt; cancel closes it again.
			ctx, cancel := chromedp.NewContext(parentCtx)
			err := captureScreenshot(ctx, task.URL, fullPage, timeout, task.Filename)
			cancel()

			if err == nil {
				log.Printf("工作线程 %d 成功保存截图: %s\n", workerID, task.Filename)
				success = true
				break
			}

			log.Printf("工作线程 %d 处理 %s 失败: %v (尝试 %d/%d)\n", workerID, task.URL, err, attempt, retry)

			// Errors for which another attempt cannot help.
			if strings.Contains(err.Error(), "ERR_NAME_NOT_RESOLVED") {
				log.Printf("域名未被解析,停止对此URL的重试: %s", task.URL)
				exhausted = false
				break
			}
			if errors.Is(err, context.Canceled) || strings.Contains(err.Error(), "context canceled") {
				log.Printf("上下文已取消,停止对此URL的重试: %s", task.URL)
				exhausted = false
				break
			}
		}

		// Count the task as processed whether or not it succeeded.
		atomic.AddInt64(&finishedTasks, 1)

		if success {
			continue
		}
		if exhausted {
			log.Printf("工作线程 %d 处理 %s 失败,已达到最大重试次数\n", workerID, task.URL)
		} else {
			log.Printf("工作线程 %d 处理 %s 失败,已停止重试\n", workerID, task.URL)
		}
		recordFailedURL(task.URL)
	}
}

// recordFailedURL appends rawURL as one line to failed_urls.txt in the working
// directory. Problems with the file are logged instead of being dropped.
func recordFailedURL(rawURL string) {
	f, err := os.OpenFile("failed_urls.txt", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		log.Printf("无法打开failed_urls.txt: %v", err)
		return
	}
	if _, err := f.WriteString(rawURL + "\n"); err != nil {
		log.Printf("写入failed_urls.txt失败: %v", err)
	}
	if err := f.Close(); err != nil {
		log.Printf("关闭failed_urls.txt失败: %v", err)
	}
}

// captureScreenshot opens url in the tab behind ctx, waits until the document
// reports readyState "complete" plus a two-second settle delay, and writes a
// PNG screenshot to outputPath.
//
// When fullPage is true the whole page is captured, otherwise only the visible
// viewport. timeout is the overall limit in seconds for navigation, waiting
// and capture. The error of the failing step is logged and returned.
func captureScreenshot(ctx context.Context, url string, fullPage bool, timeout int, outputPath string) error {
	ctx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
	defer cancel()

	var buf []byte
	var shot chromedp.Action = chromedp.CaptureScreenshot(&buf)
	if fullPage {
		// Quality 100 makes chromedp emit PNG; any lower value yields JPEG data,
		// which would not match the .png file name.
		shot = chromedp.FullScreenshot(&buf, 100)
	}

	err := chromedp.Run(ctx,
		chromedp.Navigate(url),
		chromedp.ActionFunc(waitForPageReady),
		shot,
	)
	if err != nil {
		if errors.Is(err, context.DeadlineExceeded) {
			log.Printf("截图超时(context deadline exceeded):%s", url)
		} else if errors.Is(err, context.Canceled) {
			log.Printf("截图被取消(context canceled):%s", url)
		} else {
			log.Printf("截图失败: %s, 错误: %+v", url, err)
		}
		return err
	}

	return ioutil.WriteFile(outputPath, buf, 0644)
}

// waitForPageReady polls document.readyState every 500 ms until it is
// "complete", then waits two more seconds for late content. The deadline of
// ctx bounds the wait; no separate timer is needed.
func waitForPageReady(ctx context.Context) error {
	var readyState string
	for {
		if err := chromedp.Evaluate(`document.readyState`, &readyState).Do(ctx); err != nil {
			return err
		}
		if readyState == "complete" {
			return sleepContext(ctx, 2*time.Second)
		}
		if err := sleepContext(ctx, 500*time.Millisecond); err != nil {
			return err
		}
	}
}

// sleepContext pauses for d, returning early with ctx.Err() if ctx is
// cancelled or its deadline passes first.
func sleepContext(ctx context.Context, d time.Duration) error {
	timer := time.NewTimer(d)
	defer timer.Stop()
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}

go.mod

module screenshot-tool

go 1.24.4

require (
	github.com/chromedp/cdproto v0.0.0-20250403032234-65de8f5d025b // indirect
	github.com/chromedp/chromedp v0.13.7
	github.com/chromedp/sysutil v1.1.0 // indirect
	github.com/go-json-experiment/json v0.0.0-20250211171154-1ae217ad3535 // indirect
	github.com/gobwas/httphead v0.1.0 // indirect
	github.com/gobwas/pool v0.2.1 // indirect
	github.com/gobwas/ws v1.4.0 // indirect
	golang.org/x/sys v0.29.0 // indirect
)

运行命令:

go run main.go

网站公告

今日签到

点亮在社区的每一天
去签到