核心思路分析
基于 ThinkPHP 的容器机制和反射,可以构建一个类似 think\Manager 的爬虫驱动管理系统。
1. 数据库结构设计
爬虫驱动配置表
-- Registry of crawl drivers: one row maps a site to the PHP class that crawls it.
-- CrawlDriverManager only reads rows whose `status` is 1.
CREATE TABLE `crawl_drivers` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `site_id` int(11) NOT NULL COMMENT '站点ID',
  `site_name` varchar(100) NOT NULL COMMENT '站点名称',
  `driver_name` varchar(100) NOT NULL COMMENT '驱动名称(hdmoli)',
  `driver_class` varchar(255) NOT NULL COMMENT '驱动类路径',
  -- JSON object handed to the driver constructor; NULL means "use driver defaults".
  `driver_config` text COMMENT '驱动配置(JSON)',
  -- NOT NULL so a row can never fall outside the `status = 1` filter by being NULL.
  `status` tinyint(1) NOT NULL DEFAULT 1 COMMENT '状态',
  `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`),
  UNIQUE KEY `uk_site_driver` (`site_id`, `driver_name`),
  -- Lookups by driver name cannot use uk_site_driver (driver_name is not its
  -- left-most column), so they get their own index.
  KEY `idx_driver_name_status` (`driver_name`, `status`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
初始化数据
-- Seed row: registers the Hdmoli driver for site 1 with its request settings.
INSERT INTO `crawl_drivers`
  (`site_id`, `site_name`, `driver_name`, `driver_class`, `driver_config`)
VALUES
  (1, 'Hdmoli', 'hdmoli', 'app\\services\\docksite\\drivers\\HdmoliDriver', '{"timeout": 30, "retry": 3}');
2. 爬虫驱动管理器
<?php
namespace app\services\docksite;
use think\Manager;
use ReflectionClass;
use InvalidArgumentException;
/**
 * Resolves crawl drivers from the `crawl_drivers` table and instantiates them
 * through the application container.
 *
 * Drivers can be requested by name (driver()) or by site (getBySiteId()).
 * Every driver class must implement CrawlDriverInterface.
 */
class CrawlDriverManager extends Manager
{
    /**
     * Namespace prefix of the bundled driver classes.
     *
     * Not referenced by the methods of this class: the concrete class always
     * comes from the `driver_class` column.
     * NOTE(review): presumably still consumed by the parent think\Manager — confirm.
     */
    protected $namespace = 'app\\services\\docksite\\drivers\\';

    /**
     * Driver instances created by getBySiteId(), keyed by "<site_id>:<driver_name>".
     *
     * Kept apart from the name-keyed cache so that two sites sharing one driver
     * name each keep the instance built from their own configuration row.
     *
     * @var array
     */
    protected $siteDrivers = [];

    /**
     * Get a driver instance by name.
     *
     * @param string|null $name Driver name; falls back to getDefaultDriver() when empty.
     * @return mixed The driver instance (also stored in the name-keyed cache).
     * @throws InvalidArgumentException When no name is given and there is no default.
     */
    public function driver(?string $name = null)
    {
        $name = $name ?: $this->getDefaultDriver();

        if (is_null($name)) {
            throw new InvalidArgumentException('无法解析爬虫驱动');
        }

        // getDriver() is inherited from think\Manager.
        return $this->drivers[$name] = $this->getDriver($name);
    }

    /**
     * Create a driver instance from the enabled row matching the driver name.
     *
     * @param string $name Driver name (`driver_name` column).
     * @return mixed The new driver instance.
     * @throws InvalidArgumentException When no enabled row exists or the row is invalid.
     */
    protected function createDriver(string $name)
    {
        $driverInfo = $this->getDriverConfig($name);

        if (!$driverInfo) {
            throw new InvalidArgumentException("驱动 [{$name}] 不存在");
        }

        return $this->buildDriver($driverInfo);
    }

    /**
     * Validate the class named by a `crawl_drivers` row and instantiate it.
     *
     * @param array|\ArrayAccess $driverInfo A row of `crawl_drivers`.
     * @return mixed The new driver instance.
     * @throws InvalidArgumentException When the class is missing, does not implement
     *                                  CrawlDriverInterface, or its config is invalid.
     */
    protected function buildDriver($driverInfo)
    {
        $driverClass = $driverInfo['driver_class'];

        // Fail with a readable message before reflection would throw its own error.
        if (!class_exists($driverClass)) {
            throw new InvalidArgumentException("驱动类 [{$driverClass}] 不存在");
        }

        $reflect = new ReflectionClass($driverClass);

        if (!$reflect->implementsInterface(CrawlDriverInterface::class)) {
            throw new InvalidArgumentException("驱动类必须实现 CrawlDriverInterface 接口");
        }

        $config = $this->decodeDriverConfig($driverInfo['driver_config'] ?? null, $driverClass);

        // Instantiate through the container; the config array is the positional argument.
        return $this->app->invokeClass($driverClass, [$config]);
    }

    /**
     * Turn the raw `driver_config` column into the array handed to the driver.
     *
     * NULL and the empty string mean "no overrides". Anything that does not decode
     * to an array is rejected here instead of surfacing later as a type error in
     * the driver constructor.
     *
     * @param mixed  $raw         Raw column value (JSON text, NULL, or an already decoded array).
     * @param string $driverClass Driver class name, used in the error message only.
     * @return array
     * @throws InvalidArgumentException When the value is not a JSON object/array.
     */
    protected function decodeDriverConfig($raw, string $driverClass): array
    {
        if ($raw === null || $raw === '') {
            return [];
        }

        if (is_array($raw)) {
            return $raw;
        }

        $config = json_decode((string) $raw, true);

        if (!is_array($config)) {
            throw new InvalidArgumentException(
                "驱动类 [{$driverClass}] 的配置(driver_config)不是合法的 JSON 对象"
            );
        }

        return $config;
    }

    /**
     * Load the enabled `crawl_drivers` row for a driver name.
     *
     * @param string $name Driver name.
     * @return mixed The row, or an empty value when nothing matches.
     */
    protected function getDriverConfig(string $name)
    {
        return $this->app->db->name('crawl_drivers')
            ->where('driver_name', $name)
            ->where('status', 1)
            ->find();
    }

    /**
     * Get the driver configured for a site.
     *
     * The instance is built from the row found for this site, so it always
     * receives this site's own `driver_class` and `driver_config`, even when
     * another site uses the same driver name. The row is re-read on every call;
     * only the constructed instance is cached.
     *
     * @param int $siteId Site ID.
     * @return mixed The driver instance.
     * @throws InvalidArgumentException When the site has no enabled driver row
     *                                  or the row is invalid.
     */
    public function getBySiteId(int $siteId)
    {
        $driverInfo = $this->app->db->name('crawl_drivers')
            ->where('site_id', $siteId)
            ->where('status', 1)
            ->find();

        if (!$driverInfo) {
            throw new InvalidArgumentException("站点 [{$siteId}] 未配置爬虫驱动");
        }

        $cacheKey = $siteId . ':' . $driverInfo['driver_name'];

        if (!isset($this->siteDrivers[$cacheKey])) {
            $this->siteDrivers[$cacheKey] = $this->buildDriver($driverInfo);
        }

        return $this->siteDrivers[$cacheKey];
    }

    /**
     * There is no default crawl driver; callers must name one or use getBySiteId().
     *
     * @return null
     */
    public function getDefaultDriver()
    {
        return null;
    }
}
3. 爬虫驱动接口
<?php
namespace app\services\docksite;
/**
 * Contract every crawl driver must fulfil.
 */
interface CrawlDriverInterface
{
    /**
     * Crawl a listing page.
     *
     * @param string $url    URL of the listing page.
     * @param int    $siteId ID of the site being crawled.
     * @return array The extracted data.
     */
    public function getListData(string $url, int $siteId): array;

    /**
     * Crawl the "updated today" page.
     *
     * @param string $url    URL of the page.
     * @param int    $siteId ID of the site being crawled.
     * @return array The extracted data.
     */
    public function getTodayData(string $url, int $siteId): array;

    /**
     * Crawl a detail page.
     *
     * @param string $url    URL of the detail page.
     * @param int    $siteId ID of the site being crawled.
     * @return array The extracted data.
     */
    public function getDetailData(string $url, int $siteId): array;

    /**
     * Crawl all films of a site.
     *
     * @param int    $siteId  ID of the site being crawled.
     * @param string $siteUrl URL of the site.
     * @return bool NOTE(review): the meaning of the flag is not defined here —
     *              presumably success/failure; confirm.
     */
    public function getAllFilms(int $siteId, string $siteUrl): bool;
}
4. Hdmoli驱动实现
<?php
namespace app\services\docksite\drivers;
use QL\QueryList;
use app\services\docksite\CrawlDriverInterface;
use app\services\docksite\shequ\BaseService;
/**
 * Crawl driver for the Hdmoli site, built on QueryList.
 */
class HdmoliDriver extends BaseService implements CrawlDriverInterface
{
    /**
     * Effective driver settings (defaults overlaid with the supplied config).
     *
     * NOTE(review): no method in this class reads this property yet, so
     * timeout / retry / user_agent currently have no effect here.
     */
    protected $config;

    /**
     * @param array $config Overrides for the default timeout, retry and user_agent.
     */
    public function __construct(array $config = [])
    {
        $defaults = [
            'timeout' => 30,
            'retry' => 3,
            'user_agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        ];

        $this->config = array_merge($defaults, $config);
    }

    /**
     * Crawl a listing page using the list rules.
     *
     * @param string $url    Listing page URL.
     * @param int    $siteId Site ID stamped onto every returned item.
     * @return array
     */
    public function getListData(string $url, int $siteId): array
    {
        return $this->crawlData($url, $this->getListRules(), $siteId);
    }

    /**
     * Crawl the "updated today" page using the today rules.
     *
     * @param string $url    Page URL.
     * @param int    $siteId Site ID stamped onto every returned item.
     * @return array
     */
    public function getTodayData(string $url, int $siteId): array
    {
        return $this->crawlTodayData($url, $this->getTodayRules(), $siteId);
    }

    /**
     * Crawl a detail page using the detail rules.
     *
     * @param string $url    Detail page URL.
     * @param int    $siteId Site ID stamped onto every returned item.
     * @return array
     */
    public function getDetailData(string $url, int $siteId): array
    {
        return $this->crawlData($url, $this->getDetailRules(), $siteId);
    }

    /**
     * Crawl all films of the site.
     *
     * Currently a stub that always reports true.
     * TODO: port the original getFilms logic.
     *
     * @param int    $siteId  Site ID.
     * @param string $siteUrl Site URL.
     * @return bool
     */
    public function getAllFilms(int $siteId, string $siteUrl): bool
    {
        return true;
    }

    /**
     * Fetch a page and extract items from it with the given rules.
     *
     * @param string $url    Page URL.
     * @param array  $rules  Array with 'selectors' and 'range' entries.
     * @param int    $siteId Site ID stamped onto every returned item.
     * @return array
     */
    protected function crawlData(string $url, array $rules, int $siteId): array
    {
        $markup = $this->makeRequest($url);
        $document = QueryList::getInstance()->html($markup);

        return $this->queryWithRules($document, $rules, $siteId);
    }

    /**
     * Fetch the "updated today" page and extract items from it.
     *
     * Every `div.module-item` is first wrapped in a `div.goods-item` element;
     * the today rules use that wrapper as their range.
     *
     * @param string $url    Page URL.
     * @param array  $rules  Array with 'selectors' and 'range' entries.
     * @param int    $siteId Site ID stamped onto every returned item.
     * @return array
     */
    protected function crawlTodayData(string $url, array $rules, int $siteId): array
    {
        $markup = $this->makeRequest($url);
        $document = QueryList::getInstance()->html($markup);
        $document->find('div.module-item')->wrap("<div class='goods-item'></div>");

        return $this->queryWithRules($document, $rules, $siteId);
    }

    /**
     * Apply selectors and range to a loaded document and return the items,
     * each with a `site_id` entry added.
     *
     * @param mixed $document QueryList object that already holds the page HTML.
     * @param array $rules    Array with 'selectors' and 'range' entries.
     * @param int   $siteId   Site ID stamped onto every returned item.
     * @return array
     */
    private function queryWithRules($document, array $rules, int $siteId): array
    {
        $result = $document
            ->rules($rules['selectors'])
            ->range($rules['range'])
            ->query();

        $stampSite = function ($row) use ($siteId) {
            $row['site_id'] = $siteId;

            return $row;
        };

        return $result->getData($stampSite)->all();
    }

    /**
     * Download a page.
     *
     * Delegates to the static curl_request() helper, which is not defined in
     * this class. NOTE(review): presumably inherited from BaseService — confirm.
     *
     * @param string $url Page URL.
     * @return string
     */
    protected function makeRequest(string $url): string
    {
        return static::curl_request($url);
    }

    /**
     * Extraction rules for listing pages.
     *
     * @return array 'selectors' maps a field name to [CSS selector, attribute];
     *               'range' is the CSS selector of one item.
     */
    protected function getListRules(): array
    {
        $selectors = [
            'href' => ['a.stui-vodlist__thumb', 'href'],
            'title' => ['a.stui-vodlist__thumb', 'attr(title)'],
        ];

        return [
            'selectors' => $selectors,
            'range' => 'ul.stui-vodlist>li',
        ];
    }

    /**
     * Extraction rules for the "updated today" page.
     *
     * The range targets the `div.goods-item` wrapper that crawlTodayData() inserts.
     *
     * @return array Same shape as getListRules().
     */
    protected function getTodayRules(): array
    {
        $selectors = [
            'href' => ['a.module-card-item-poster', 'href'],
            'title' => ['div.module-card-item-title>a>strong', 'text'],
        ];

        return [
            'selectors' => $selectors,
            'range' => 'div.goods-item',
        ];
    }

    /**
     * Extraction rules for detail pages.
     *
     * @return array Same shape as getListRules().
     */
    protected function getDetailRules(): array
    {
        $selectors = [
            'title' => ['.title', 'text'],
            'content' => ['.content', 'text'],
        ];

        return [
            'selectors' => $selectors,
            'range' => '.detail-container',
        ];
    }
}
5. 重构后的HdmoliService
<?php
namespace app\services\docksite\shequ;
use app\services\docksite\CrawlDriverManager;
/**
 * Static facade kept for existing callers; every method delegates to the
 * crawl driver configured for the given site.
 */
class HdmoliService extends BaseService
{
    /**
     * Crawl a listing page through the site's driver.
     *
     * @param mixed $url    Listing page URL.
     * @param mixed $siteId Site ID.
     * @return mixed Whatever the driver's getListData() returns.
     */
    public static function getAsyncData($url, $siteId)
    {
        return self::driverForSite($siteId)->getListData($url, $siteId);
    }

    /**
     * Crawl the "updated today" page through the site's driver.
     *
     * @param mixed $url    Page URL.
     * @param mixed $siteId Site ID.
     * @return mixed Whatever the driver's getTodayData() returns.
     */
    public static function getAsyncTodayData($url, $siteId)
    {
        return self::driverForSite($siteId)->getTodayData($url, $siteId);
    }

    /**
     * Crawl all films of a site through the site's driver.
     *
     * @param mixed $siteId  Site ID.
     * @param mixed $siteUrl Site URL.
     * @return mixed Whatever the driver's getAllFilms() returns.
     */
    public static function getFilms($siteId, $siteUrl)
    {
        return self::driverForSite($siteId)->getAllFilms($siteId, $siteUrl);
    }

    /**
     * Resolve the manager from the container and ask it for the site's driver.
     *
     * @param mixed $siteId Site ID.
     * @return mixed The driver returned by CrawlDriverManager::getBySiteId().
     */
    private static function driverForSite($siteId)
    {
        /** @var CrawlDriverManager $manager */
        $manager = app()->make(CrawlDriverManager::class);

        return $manager->getBySiteId($siteId);
    }
}
6. 服务注册
在 app/provider.php(ThinkPHP 6 的容器绑定定义文件,位于 app 目录而非 config 目录)中注册:
// provider.php is a map of container identifier => class. A bare list entry
// would be bound under the integer key 0, so the manager gets an explicit alias.
// Resolving it by class name (app()->make(CrawlDriverManager::class)) keeps working.
return [
    // ... other bindings
    'crawl_driver' => \app\services\docksite\CrawlDriverManager::class,
];
优势总结
- 解耦合: 爬虫逻辑与具体实现分离
- 可扩展: 新增站点只需实现接口并配置数据库
- 动态配置: 通过数据库动态管理驱动配置
- 依赖注入: 利用 ThinkPHP 容器自动注入依赖
- 反射机制: 运行时动态加载和验证驱动类
- 统一管理: 类似框架内置的缓存、数据库等管理方式
这种设计充分利用了您代码仓库中的 ThinkPHP 框架特性,实现了高度可配置和可扩展的爬虫驱动系统。