@sun2920989 之前做过一个 cli 的爬虫,当时是用 swoole 解决的。做法就是专门起一个负责 task 的进程,用 \Swoole\Server 监听一个 sock 文件,具体文档见
https://wiki.swoole.com/#/consts?id=socket-%e7%b1%bb%e5%9e%8b 。做的时候有些偷懒,没有用 redis,而是直接用 \Swoole\Table 做基础的共享数据存储,sock 负责队列控制。虽然用 swoole 实现在语法上很好看,用起来却比 curl_multi_exec 要麻烦得多,后续再也没有用过这个方案。在这边讨论这个方案稍微有点超纲,算是把之前踩过的坑提醒一下,附一个当时写的部分源码:
```php
/**
 * Starts a Swoole server bound to a Unix stream socket and wires its
 * receive / task / finish events to this object's handler methods.
 *
 * Two shared tables and three atomic counters are attached to the server
 * object before it starts, so the handlers can reach them via the server.
 *
 * @param Input  $input  Console input; not read by this method.
 * @param Output $output Console output; forwarded to every event handler.
 */
protected function execute(Input $input, Output $output)
{
    /*
     * Base configuration table, used to instantiate the crawler class.
     * Expected rows (one string column named "value"):
     *
     *   key       | value
     *   ----------+------------------------
     *   service   | service
     *   baseUrl   | https://
     *   cookieJar | json_encode(cookieJar)
     *
     * NOTE(review): the "value" column is declared with length 512; confirm
     * the encoded cookie jar always fits.
     */
    $configTable = new \Swoole\Table(4);
    $configTable->column('value', \Swoole\Table::TYPE_STRING, 512);
    $configTable->create();

    /*
     * Records which operations have already been run by the getList method.
     * Expected rows (one integer column named "count", initial value 0):
     *
     *   getDatacenters, getIpsGroupList, getIpsList, getSwitchList,
     *   getHardwareModelList, getPurchaseList, getServerList, getHardwareList
     *
     * NOTE(review): the column is declared with size 1 — presumably a
     * one-byte integer; verify the counters never exceed that range.
     */
    $doneListTable = new \Swoole\Table(16);
    $doneListTable->column('count', \Swoole\Table::TYPE_INT, 1);
    $doneListTable->create();

    // The socket file lives under the configured runtime path.
    $socketPath = Env::get('runtime_path') . 'task.sock';
    $server = new \Swoole\Server($socketPath, 0, SWOOLE_PROCESS, SWOOLE_SOCK_UNIX_STREAM);

    // Shared state is hung off the server object so the handlers can read it.
    $server->table = [
        'initData' => $configTable,
        'acquiredList' => $doneListTable,
    ];
    $server->atomic = [
        'working' => new \Swoole\Atomic(),
        'successed' => new \Swoole\Atomic(),
        'failed' => new \Swoole\Atomic(),
    ];

    $server->set(['task_worker_num' => 15]);

    // Statement closures are deliberate: each one discards whatever the
    // handler returns. NOTE(review): a value returned from the task callback
    // may be treated by Swoole as the task result, so do not convert these to
    // arrow functions without checking.
    $server->on('receive', function ($srv, $fd, $reactorId, $payload) use ($output) {
        $this->receive($srv, $fd, $reactorId, $payload, $output);
    });
    $server->on('task', function ($srv, $taskId, $sourceWorkerId, $payload) use ($output) {
        $this->task($srv, $taskId, $sourceWorkerId, $payload, $output);
    });
    $server->on('finish', function ($srv, $taskId, $result) use ($output) {
        $this->finish($srv, $taskId, $result, $output);
    });

    $server->start();
}
```