header = $this->agent[rand(0,count($this->agent) - 1)];
$this->referer = empty($referer)?'http://weixin.sogou.com/' : $referer;
$this->host = empty($host)?'weixin.sogou.com' : $host;
/** @var 处理微信图片的防盗链 */
$this->antiLeech = '/Public/Home/GetWeChatImg.php?url=';
$this->antiLeech = 'http://'.$_SERVER['SERVER_NAME'].'/Public/Home/GetWeChatImg.php?url=';
}
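/**
* Fetch the raw content at the given URL.
* @author bignerd
* @since 2016-08-16T10:13:58+0800
* @param string $url URL to fetch
* @return string|false fetched content, or false when the read fails
*/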
public function _get($url)
{
    // Plain blocking read of the URL; the result of file_get_contents()
    // (the body as a string, or false on failure) is handed straight back
    // to the caller without inspection.
    return file_get_contents($url);
}
/**
 * Crawl one article URL and return its metadata together with its body.
 *
 * The result is the array produced by articleBasicInfo() merged with two
 * extra keys, 'content_html' and 'content_text', taken from contentHandle().
 * A non-empty 'cover' value is prefixed with $this->antiLeech.
 *
 * @param string $url article URL to fetch
 * @return array merged article information
 */
public function crawByUrl($url)
{
    $content = $this->_get($url);
    $basicInfo = $this->articleBasicInfo($content);
    list($content_html, $content_text) = $this->contentHandle($content);

    $result = array_merge($basicInfo, array(
        'content_html' => $content_html,
        'content_text' => $content_text,
    ));

    // The key must be a quoted string: a bare word here is looked up as a
    // constant, which is an Error on PHP 8 and a warning on PHP 7.2+.
    // NOTE(review): contentHandle() urlencode()s image URLs before appending
    // them to the proxy prefix; the cover URL is appended as-is — confirm
    // that is intended.
    $result['cover'] = !empty($result['cover']) ? $this->antiLeech . $result['cover'] : "";

    return $result;
}
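/**
* Process WeChat article source: extract the article body and rewrite its image links.
* @author bignerd
* @since 2016-08-16T15:59:27+0800
* @param string $content fetched WeChat article source
* @return array [HTML text with images, HTML text without images]
*/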
public function contentHandle($content)
{
    // NOTE(review): this pattern starts with a bare line break and has no
    // opening tag before the capture group — it looks like the tag text was
    // lost at some point; confirm against the intended markup.
    $content_html_pattern = '/
(.*?)<\/div>/s';

    // Only the first match is used, so preg_match() is enough. Default to an
    // empty string when nothing matches instead of reading an undefined
    // offset and passing null on to the calls below.
    $content_html = '';
    if (preg_match($content_html_pattern, (string) $content, $html_matchs) === 1
        && isset($html_matchs[1])
    ) {
        $content_html = $html_matchs[1];
    }

    // Turn every data-src="..." attribute into a src attribute.
    $content_html = preg_replace_callback('/data-src="(.*?)"/', function ($matches) {
        // URLs containing v.qq.com (QQ video) are passed through unchanged.
        if (strpos($matches[1], "v.qq.com") !== false) {
            return 'src=' . $matches[1];
        }
        // Everything else is routed through the anti-leech proxy prefix.
        return 'src=' . $this->antiLeech . urlencode($matches[1]);
    }, $content_html);

    // Second variant of the body with the pattern below removed.
    // NOTE(review): as written the pattern is a single line break, so only
    // line breaks are stripped; the original docs call this the "no image"
    // text — confirm whether an <img ...> pattern was intended.
    $content_text = preg_replace('/
/s','',$content_html);

    return array($content_html, $content_text);
}
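/**
* Extract the basic information of an article.
* @author bignerd
* @since 2016-08-16T17:16:32+0800
* @param string $content article detail page source
* @return array $basicInfo extracted fields keyed by name (empty string when a field is not found)
*/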
public function articleBasicInfo($content)
{
    // Source variable name (as it appears in ` var NAME = "...";`) => result key.
    $fieldMap = array(
        'ct' => 'date',               // publish time
        'msg_title' => 'title',       // title
        'msg_desc' => 'digest',       // description
        'msg_link' => 'content_url',  // article link
        'msg_cdn_url' => 'cover',     // cover image link
        'nickname' => 'wechatname',   // official account name
    );

    // These two keys come first in the result and default to empty strings.
    $basicInfo = array(
        'author' => '',
        'copyright_stat' => '',
    );

    foreach ($fieldMap as $varName => $field) {
        preg_match_all('/ var ' . $varName . ' = "(.*?)";/s', $content, $found);
        // Use the first captured value when it is non-empty, otherwise ''.
        $basicInfo[$field] = empty($found[1][0])
            ? ''
            : $this->htmlTransform($found[1][0]);
    }

    // Author.
    // NOTE(review): the pattern has no opening tag, so group 1 captures
    // everything from the start of the input up to the first </em>;
    // presumably an opening-tag prefix is missing — confirm.
    preg_match('/(.*?)<\/em>/s', $content, $authorHit);
    if (!empty($authorHit[1])) {
        $basicInfo['author'] = $authorHit[1];
    }

    // Article type.
    // NOTE(review): same concern as above for the first </span>.
    preg_match('/(.*?)<\/span>/s', $content, $typeHit);
    if (!empty($typeHit[1])) {
        $basicInfo['copyright_stat'] = $typeHit[1];
    }

    return $basicInfo;
}
/**
* Convert special characters in a string.
* @author bignerd
* @since 2016-08-16T17:30:52+0800
* @param string $string text to convert
* @return string converted text
*/
public function htmlTransform($string)
{
    // Decode the handful of HTML entities that show up in the scraped
    // values, then strip backslashes. The search strings must be the entity
    // names: replacing a character with itself is a no-op.
    //
    // str_replace() with arrays applies the pairs one after another, in
    // order, so this behaves like a chain of single replacements:
    //   1. &quot; -> "
    //   2. &amp;  -> &
    //   3. amp;   -> (removed) cleans up what is left of a doubly encoded
    //                "&amp;amp;" after step 2
    //   4. &lt;   -> <
    //   5. &gt;   -> >
    //   6. &nbsp; -> regular space
    //   7. \      -> (removed) drops escaping such as "\/"
    $search = array('&quot;', '&amp;', 'amp;', '&lt;', '&gt;', '&nbsp;', '\\');
    $replace = array('"', '&', '', '<', '>', ' ', '');

    return str_replace($search, $replace, $string);
}
}