cms-manage/app/command/service/step/CheckKeywords.php

<?php

namespace app\command\service\step;

use think\facade\Db;

/**
 * Task step that measures keyword density on a site's column (category) pages
 * and on the article pages that belong to those columns, and collects the URLs
 * that could not be fetched.
 */
class CheckKeywords
{
    /**
     * Fetch every column page and every article page of the site and report
     * the keyword density per path together with the list of dead links.
     *
     * @param array $taskData Task description. Keys read here: 'keywords'
     *                        (rows with a 'name'), 'cate' (rows with 'id' and
     *                        'alias') and 'websiteUrl' (host without scheme).
     * @param mixed $db       Query builder exposing
     *                        select()/from()/where()/limit()/offset()/row()/query().
     *
     * @return array Empty array when there are no keywords. Otherwise an array
     *               with 'deadLink' (list of unreachable URLs) and, when at
     *               least one page was analysed, 'keywords' (one report line
     *               per analysed path).
     */
    public static function run($taskData, $db)
    {
        $keywords = isset($taskData['keywords']) ? $taskData['keywords'] : [];
        if (empty($keywords)) {
            return [];
        }

        $columnPages = (isset($taskData['cate']) && is_array($taskData['cate'])) ? $taskData['cate'] : [];
        $baseUrl = 'http://' . (isset($taskData['websiteUrl']) ? $taskData['websiteUrl'] : '');

        $context = stream_context_create([
            'http' => [
                'method' => 'GET',
                'timeout' => 10,
            ],
        ]);

        // Densities and dead links are kept in separate arrays so that a page
        // path can never collide with the 'deadLink' bucket.
        $densities = [];
        $deadLinks = [];

        $cateIds = [];
        $cateId2Alias = [];

        // Column (category) pages.
        foreach ($columnPages as $vo) {
            // Ids are interpolated into SQL below, so force them to integers.
            $cateId = (int) $vo['id'];
            $cateIds[] = $cateId;
            $cateId2Alias[$cateId] = $vo['alias'];

            $pageUrl = $baseUrl . $vo['alias'];

            try {
                $nowPageHtml = self::fetchPage($pageUrl, $context);
                echo "查询获取到页面数据" . PHP_EOL;
                $densities[$vo['alias']] = self::keywordDensities(self::getCleanContent($nowPageHtml), $keywords);
            } catch (\Exception $e) {
                echo "获取到死链" . $pageUrl . PHP_EOL;
                $deadLinks[] = $pageUrl;
            }
        }

        // Article pages, read from the category/content link table in pages of $pageSize rows.
        if (!empty($cateIds)) {
            $pageSize = 20;
            $table = env('database.prefix') . 'category_sub_content';
            $where = 'category_id in(' . implode(',', array_unique($cateIds)) . ')';

            $countRow = $db->select('count(*) as `c_total`')->from($table)->where($where)->row();
            $total = isset($countRow['c_total']) ? (int) $countRow['c_total'] : 0;
            $totalPage = (int) ceil($total / $pageSize);

            // NOTE(review): the paginated query has no ORDER BY, so row order
            // across pages relies on the database returning a stable order.
            for ($nowPage = 1; $nowPage <= $totalPage; $nowPage++) {
                $offset = ($nowPage - 1) * $pageSize;
                $articles = $db->select('category_id,sub_content_id')->from($table)
                    ->where($where)->limit($pageSize)->offset($offset)->query();

                foreach ($articles as $vo) {
                    if (!isset($cateId2Alias[$vo['category_id']])) {
                        continue;
                    }

                    $url = $cateId2Alias[$vo['category_id']] . '/' . $vo['sub_content_id'];
                    $pageUrl = $baseUrl . $url;

                    try {
                        $nowPageHtml = self::fetchPage($pageUrl, $context);
                        $densities[$url] = self::keywordDensities(self::getCleanContent($nowPageHtml), $keywords);
                    } catch (\Exception $e) {
                        $deadLinks[] = $pageUrl;
                    }
                }
            }
        }

        // Build the final report: dead links first, then one line per analysed path.
        $analysisFinal = ['deadLink' => $deadLinks];
        foreach ($densities as $path => $perKeyword) {
            $analysisFinal['keywords'][] = '路径 ' . $path . ' 的关键词密度是：' . round(array_sum($perKeyword), 2);
        }

        return $analysisFinal;
    }

    /**
     * Download a page and fail loudly when it cannot be retrieved.
     *
     * file_get_contents() signals failure by returning false; that is turned
     * into an exception here so callers can treat the URL as a dead link.
     *
     * @param string   $pageUrl Absolute URL to fetch.
     * @param resource $context Stream context carrying the HTTP options.
     *
     * @return string Raw response body.
     *
     * @throws \RuntimeException When the page cannot be fetched.
     */
    private static function fetchPage($pageUrl, $context)
    {
        $html = file_get_contents($pageUrl, false, $context);
        if ($html === false) {
            throw new \RuntimeException('Unable to fetch ' . $pageUrl);
        }

        return $html;
    }

    /**
     * Compute, for each keyword, the share of the page text it occupies:
     * occurrences * keyword length / total text length, rounded to 4 decimals.
     *
     * @param string $totalWords Cleaned page text.
     * @param array  $keywords   Rows with a 'name' entry.
     *
     * @return array Map of keyword name => density.
     */
    private static function keywordDensities($totalWords, $keywords)
    {
        $totalLength = mb_strlen($totalWords);

        $result = [];
        foreach ($keywords as $v) {
            $name = (string) $v['name'];

            // An empty page would divide by zero and an empty keyword is
            // rejected by substr_count(); both simply score 0.
            if ($totalLength === 0 || $name === '') {
                $result[$v['name']] = 0.0;
                continue;
            }

            $result[$v['name']] = round((substr_count($totalWords, $name) * mb_strlen($name)) / $totalLength, 4);
        }

        return $result;
    }

    /**
     * Reduce an HTML page to the text used for keyword counting: the visible
     * text with tags and whitespace removed, followed by every non-empty
     * image alt text and every non-empty link title.
     *
     * @param string $nowPage Raw HTML of the page.
     *
     * @return string Concatenated text without spaces, tabs or line breaks
     *                in the body part.
     */
    private static function getCleanContent($nowPage)
    {
        $html = \phpQuery::newDocument($nowPage);
        $images = $html['img'];
        $a = $html['a'];

        // Blank out script, noscript and style elements so their content is not counted as text.
        $html['script']->html('');
        $html['noscript']->html('');
        $html['style']->html('');

        // Collect the alt text of every image.
        $imagesAltMap = [];
        foreach ($images as $img) {
            $alt = trim((string) pq($img)->attr('alt'));
            if ($alt !== '') {
                $imagesAltMap[] = $alt;
            }
        }

        // Collect the title text of every link.
        $aTitleMap = [];
        foreach ($a as $item) {
            $title = trim((string) pq($item)->attr('title'));
            if ($title !== '') {
                $aTitleMap[] = $title;
            }
        }

        // Strip CR, LF and tabs explicitly instead of PHP_EOL only, so the
        // result does not depend on the platform the command runs on.
        $cleanWords = trim(str_replace(["\r", "\n", "\t", ' '], '', strip_tags((string) $html)));

        return $cleanWords . implode('', $imagesAltMap) . implode('', $aTitleMap);
    }
}