124 lines
4.4 KiB
PHP
124 lines
4.4 KiB
PHP
<?php
|
|
|
|
namespace app\command\service\step;
|
|
|
|
use think\facade\Db;
|
|
|
|
class CheckKeywords
{
    /**
     * Computes keyword density for every category page and every article page
     * of a website and collects the URLs that could not be analysed.
     *
     * Reads from $taskData:
     *  - 'keywords'   : list of rows, each with a 'name' entry (the keyword text)
     *  - 'cate'       : list of rows, each with 'id' and 'alias' entries
     *  - 'websiteUrl' : host part that is prefixed with "http://" to build URLs
     *
     * Article pages are looked up in the "<prefix>category_sub_content" table,
     * 20 rows at a time, and addressed as "<alias>/<sub_content_id>".
     * NOTE(review): the paged query has no ORDER BY, so page boundaries depend on
     * the database's default row order — confirm that is acceptable.
     *
     * @param array $taskData Task description (see the keys listed above).
     * @param mixed $db       Fluent query builder exposing select()/from()/where()/
     *                        limit()/offset()/row()/query(), as used below.
     *
     * @return array Empty array when no keywords are configured; otherwise
     *               ['deadLink' => string[], 'keywords' => string[]] where the
     *               'keywords' entry is only present if at least one page was analysed.
     */
    public static function run($taskData, $db)
    {
        $keywords = $taskData['keywords'];
        if (empty($keywords)) {
            return [];
        }

        $baseUrl = 'http://' . $taskData['websiteUrl'];
        $context = stream_context_create([
            'http' => [
                'method' => "GET",
                'timeout' => 10,
            ],
        ]);

        $deadLinks = [];    // URLs that could not be fetched or parsed
        $densities = [];    // page path => [keyword name => density]
        $cateIds = [];
        $cateId2Alias = [];

        // Category (column) pages.
        foreach ($taskData['cate'] as $vo) {
            // Cast to int: the ids are concatenated into an SQL IN (...) list below.
            $cateIds[] = (int) $vo['id'];
            $cateId2Alias[$vo['id']] = $vo['alias'];

            $pageUrl = $baseUrl . $vo['alias'];
            $pageDensities = null;

            $pageHtml = self::fetchPage($pageUrl, $context);
            if ($pageHtml !== null) {
                echo "查询获取到页面数据" . PHP_EOL;
                $pageDensities = self::measurePage($pageHtml, $keywords);
            }

            if ($pageDensities === null) {
                echo "获取到死链" . $pageUrl . PHP_EOL;
                $deadLinks[] = $pageUrl;
                continue;
            }

            $densities[$vo['alias']] = $pageDensities;
        }

        // Article pages belonging to the categories above.
        if (!empty($cateIds)) {
            $pageSize = 20;
            $table = env('database.prefix') . 'category_sub_content';
            $where = 'category_id in(' . implode(',', $cateIds) . ')';

            $countRow = $db->select('count(*) as `c_total`')->from($table)->where($where)->row();
            $total = isset($countRow['c_total']) ? (int) $countRow['c_total'] : 0;
            $totalPage = (int) ceil($total / $pageSize);

            for ($nowPage = 1; $nowPage <= $totalPage; $nowPage++) {
                $offset = ($nowPage - 1) * $pageSize;
                $articles = $db->select('category_id,sub_content_id')->from($table)
                    ->where($where)->limit($pageSize)->offset($offset)->query();

                if (empty($articles)) {
                    continue;
                }

                foreach ($articles as $vo) {
                    if (!isset($cateId2Alias[$vo['category_id']])) {
                        continue;
                    }

                    $url = $cateId2Alias[$vo['category_id']] . '/' . $vo['sub_content_id'];

                    $pageHtml = self::fetchPage($baseUrl . $url, $context);
                    $pageDensities = $pageHtml === null ? null : self::measurePage($pageHtml, $keywords);

                    if ($pageDensities === null) {
                        $deadLinks[] = $baseUrl . $url;
                        continue;
                    }

                    $densities[$url] = $pageDensities;
                }
            }
        }

        // Final summary: one human-readable line per analysed page.
        $analysisFinal = ['deadLink' => $deadLinks];
        foreach ($densities as $path => $perKeyword) {
            $analysisFinal['keywords'][] = '路径 ' . $path . ' 的关键词密度是:' . round(array_sum($perKeyword), 2);
        }

        return $analysisFinal;
    }

    /**
     * Downloads a page and reports failure as null.
     *
     * file_get_contents() signals failure by returning false (plus a warning), so
     * the return value is checked explicitly. The catch covers environments whose
     * error handler turns that warning into an exception.
     *
     * @param string   $url     Absolute URL to fetch.
     * @param resource $context Stream context carrying the HTTP options.
     *
     * @return string|null Page body, or null when the request failed.
     */
    private static function fetchPage($url, $context)
    {
        try {
            $body = file_get_contents($url, false, $context);
        } catch (\Exception $e) {
            return null;
        }

        return $body === false ? null : $body;
    }

    /**
     * Extracts the text of a fetched page and measures every keyword against it.
     *
     * @param string $pageHtml Raw HTML of the page.
     * @param array  $keywords Keyword rows, each with a 'name' entry.
     *
     * @return array|null Map of keyword name => density, or null when extracting
     *                    the text threw an exception.
     */
    private static function measurePage($pageHtml, $keywords)
    {
        try {
            return self::keywordDensities(self::getCleanContent($pageHtml), $keywords);
        } catch (\Exception $e) {
            return null;
        }
    }

    /**
     * Computes, for each keyword, (occurrences * keyword length) / text length,
     * rounded to 4 decimals.
     *
     * A keyword with an empty name, or an empty text, yields 0.0 instead of
     * triggering an empty-needle error or a division by zero.
     *
     * @param string $content  Cleaned page text.
     * @param array  $keywords Keyword rows, each with a 'name' entry.
     *
     * @return array Map of keyword name => density.
     */
    private static function keywordDensities($content, $keywords)
    {
        $contentLength = mb_strlen($content);
        $result = [];

        foreach ($keywords as $v) {
            $name = (string) $v['name'];

            if ($contentLength === 0 || $name === '') {
                $result[$name] = 0.0;
                continue;
            }

            $result[$name] = round((substr_count($content, $name) * mb_strlen($name)) / $contentLength, 4);
        }

        return $result;
    }

    /**
     * Reduces an HTML page to the text used for density measurement: the markup
     * with tags and whitespace removed, followed by every non-empty image "alt"
     * and every non-empty link "title".
     *
     * @param string $nowPage Raw HTML of the page.
     *
     * @return string Concatenated text without spaces, tabs or line breaks
     *                (attribute values are only trimmed at their ends).
     */
    private static function getCleanContent($nowPage)
    {
        $html = \phpQuery::newDocument($nowPage);
        $images = $html['img'];
        $a = $html['a'];

        // Blank out script, noscript and style bodies so their content is not counted as text.
        $html['script']->html('');
        $html['noscript']->html('');
        $html['style']->html('');

        $imagesAltMap = self::collectAttribute($images, 'alt');
        $aTitleMap = self::collectAttribute($a, 'title');

        // Fetched pages may use any line-ending style, so strip CR, LF and tabs
        // explicitly rather than relying on the local platform's PHP_EOL.
        $cleanWords = trim(str_replace([' ', "\t", "\r", "\n"], '', strip_tags((string) $html)));

        return $cleanWords . implode('', $imagesAltMap) . implode('', $aTitleMap);
    }

    /**
     * Collects the trimmed, non-empty values of one attribute from a set of nodes.
     *
     * @param mixed  $nodes     Iterable phpQuery selection.
     * @param string $attribute Attribute name to read from each node.
     *
     * @return string[] Attribute values in document iteration order.
     */
    private static function collectAttribute($nodes, $attribute)
    {
        $values = [];

        foreach ($nodes as $node) {
            // Cast first: a missing attribute may come back as null.
            $value = trim((string) pq($node)->attr($attribute));
            if ($value !== '') {
                $values[] = $value;
            }
        }

        return $values;
    }
}