cms-manage/app/command/service/step/CheckKeywords.php

124 lines
4.4 KiB
PHP

<?php
namespace app\command\service\step;
use think\facade\Db;
/**
 * Task step that measures keyword density on a site's category pages and on
 * the article pages linked to those categories.
 *
 * For every page the step downloads the HTML over plain HTTP, reduces it to
 * its visible text (plus image `alt` and anchor `title` attributes) and, for
 * each keyword, computes
 *
 *     occurrences * keyword length / total text length
 *
 * rounded to 4 decimals. Pages that cannot be fetched or parsed are reported
 * as dead links instead.
 */
class CheckKeywords
{
    /** Number of category/article relation rows read per database query. */
    private const ARTICLE_PAGE_SIZE = 20;

    /** HTTP timeout, in seconds, applied to every page download. */
    private const FETCH_TIMEOUT_SECONDS = 10;

    /**
     * Run the keyword density check for one task.
     *
     * Keys read from $taskData:
     *  - 'keywords'   : list of rows, each with a 'name' entry (the keyword text)
     *  - 'cate'       : list of rows, each with 'id' and 'alias' entries
     *  - 'websiteUrl' : host part that is prefixed with "http://" to build URLs
     *
     * $db is used as a fluent query builder exposing
     * select()/from()/where()/limit()/offset()/row()/query().
     *
     * @param array $taskData Task description (see above).
     * @param mixed $db       Query builder used to list category articles.
     *
     * @return array Empty array when no keywords are configured; otherwise an
     *               array with a 'deadLink' list of unreachable URLs and, when
     *               at least one page was analysed, a 'keywords' list holding
     *               one summary sentence per page.
     */
    public static function run($taskData, $db)
    {
        $keywords = $taskData['keywords'];
        if (empty($keywords)) {
            return [];
        }

        $baseUrl = 'http://' . $taskData['websiteUrl'];
        $context = stream_context_create([
            'http' => [
                'method' => 'GET',
                'timeout' => self::FETCH_TIMEOUT_SECONDS,
            ],
        ]);

        // Dead links and per-path densities live in separate arrays so that a
        // path can never collide with the report's reserved 'deadLink' key.
        $densityByPath = [];
        $deadLinks = [];

        // Category (column) pages.
        $cateIds = [];
        $cateId2Alias = [];
        foreach ($taskData['cate'] as $vo) {
            // The ids are concatenated into an SQL IN (...) list further down,
            // so force them to integers here.
            $cateIds[] = (int) $vo['id'];
            $cateId2Alias[$vo['id']] = $vo['alias'];

            $pageUrl = $baseUrl . $vo['alias'];
            $densities = self::measurePage($pageUrl, $keywords, $context, true);
            if ($densities === null) {
                echo "获取到死链" . $pageUrl . PHP_EOL;
                $deadLinks[] = $pageUrl;
            } else {
                $densityByPath[$vo['alias']] = $densities;
            }
        }

        // Article pages belonging to the categories above.
        if (!empty($cateIds)) {
            // Loop-invariant pieces of the two queries.
            $table = env('database.prefix') . 'category_sub_content';
            $where = 'category_id in(' . implode(',', $cateIds) . ')';

            $countRow = $db->select('count(*) as `c_total`')->from($table)->where($where)->row();
            // A failed count query is treated as "no articles".
            $total = (int) ($countRow['c_total'] ?? 0);
            $totalPage = (int) ceil($total / self::ARTICLE_PAGE_SIZE);

            for ($nowPage = 1; $nowPage <= $totalPage; $nowPage++) {
                $offset = ($nowPage - 1) * self::ARTICLE_PAGE_SIZE;
                $articles = $db->select('category_id,sub_content_id')->from($table)->where($where)
                    ->limit(self::ARTICLE_PAGE_SIZE)->offset($offset)->query();

                // A failed query yields a non-iterable value; skip that page.
                foreach ($articles ?: [] as $vo) {
                    if (!isset($cateId2Alias[$vo['category_id']])) {
                        continue;
                    }

                    $path = $cateId2Alias[$vo['category_id']] . '/' . $vo['sub_content_id'];
                    $densities = self::measurePage($baseUrl . $path, $keywords, $context);
                    if ($densities === null) {
                        $deadLinks[] = $baseUrl . $path;
                    } else {
                        $densityByPath[$path] = $densities;
                    }
                }
            }
        }

        // Build the final report: dead links first, then one line per page.
        $analysisFinal = ['deadLink' => $deadLinks];
        foreach ($densityByPath as $path => $densities) {
            $analysisFinal['keywords'][] = '路径 ' . $path . ' 的关键词密度是:' . round(array_sum($densities), 2);
        }

        return $analysisFinal;
    }

    /**
     * Download one page and compute the density of every keyword on it.
     *
     * @param string   $url      Absolute URL to download.
     * @param array    $keywords Keyword rows, each with a 'name' entry.
     * @param resource $context  Stream context used for the HTTP request.
     * @param bool     $announce When true, echo a progress line once the page
     *                           body has been received.
     *
     * @return array|null Map of keyword name => density, or null when the page
     *                    could not be fetched or parsed (a dead link).
     */
    private static function measurePage(string $url, array $keywords, $context, bool $announce = false): ?array
    {
        try {
            $pageHtml = file_get_contents($url, false, $context);
            // file_get_contents() reports failure by returning false; it only
            // throws when an error handler converts its warning into an
            // exception, so both paths have to be handled.
            if ($pageHtml === false) {
                return null;
            }
            if ($announce) {
                echo "查询获取到页面数据" . PHP_EOL;
            }
            $totalWords = self::getCleanContent($pageHtml);
        } catch (\Exception $e) {
            return null;
        }

        return self::keywordDensities($totalWords, $keywords);
    }

    /**
     * Compute, for each keyword, the share of the text it occupies.
     *
     * @param string $totalWords Cleaned page text.
     * @param array  $keywords   Keyword rows, each with a 'name' entry.
     *
     * @return array Map of keyword name => density rounded to 4 decimals.
     */
    private static function keywordDensities(string $totalWords, array $keywords): array
    {
        $totalLength = mb_strlen($totalWords);

        $densities = [];
        foreach ($keywords as $v) {
            $name = (string) $v['name'];
            // An empty page would divide by zero and an empty needle is
            // rejected by substr_count(); both mean "keyword not present".
            if ($totalLength === 0 || $name === '') {
                $densities[$name] = 0.0;
                continue;
            }
            $densities[$name] = round(substr_count($totalWords, $name) * mb_strlen($name) / $totalLength, 4);
        }

        return $densities;
    }

    /**
     * Reduce an HTML page to the text used for density measurement.
     *
     * The result is the document's tag-free text with spaces, tabs and line
     * breaks removed, followed by every non-empty image `alt` and every
     * non-empty anchor `title`. The contents of script, noscript and style
     * elements are discarded.
     *
     * @param string $nowPage Raw HTML of the page.
     *
     * @return string Concatenated clean text.
     */
    private static function getCleanContent($nowPage)
    {
        $html = \phpQuery::newDocument($nowPage);
        try {
            // Select the elements before script/noscript/style are emptied so
            // that images and anchors nested inside them are still counted.
            $images = $html['img'];
            $a = $html['a'];

            // Drop the contents of non-visible elements.
            $html['script']->html('');
            $html['noscript']->html('');
            $html['style']->html('');

            $imagesAltMap = self::collectAttribute($images, 'alt');
            $aTitleMap = self::collectAttribute($a, 'title');

            $text = strip_tags((string) $html);
        } finally {
            // phpQuery keeps every parsed document in a static registry;
            // release it so a long crawl does not accumulate them.
            $html->unloadDocument();
        }

        // Remove all common whitespace, not only the platform's PHP_EOL, so
        // pages served with CRLF line endings are measured the same way.
        $cleanWords = trim(str_replace([' ', "\r", "\n", "\t"], '', $text));

        return $cleanWords . implode('', $imagesAltMap) . implode('', $aTitleMap);
    }

    /**
     * Collect the trimmed, non-empty values of one attribute from a set of
     * matched elements.
     *
     * @param iterable $elements  Elements yielded by a phpQuery selection.
     * @param string   $attribute Attribute name to read.
     *
     * @return array List of attribute values.
     */
    private static function collectAttribute($elements, string $attribute): array
    {
        $values = [];
        foreach ($elements as $element) {
            // attr() yields null for a missing attribute; cast before trim().
            $value = trim((string) pq($element)->attr($attribute));
            if ($value !== '') {
                $values[] = $value;
            }
        }

        return $values;
    }
}