php 文章采集正则代码

2015-01-24信息快讯网

php 文章采集代码主要是应用了正则表达式。

 
//采集html 
function getwebcontent($url){ 
$ch = curl_init(); 
$timeout = 10; 
curl_setopt($ch, CURLOPT_URL, $url); 
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); 
curl_setopt ($ch, CURLOPT_FOLLOWLOCATION, 1); 
$contents = trim(curl_exec($ch)); 
curl_close($ch); 
return $contents; 
} 


//获得标题和url 
$string = 
getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2'); 
//正则匹配<li>获取标题和地址 
preg_match_all ("/<li><a href=\"\/learn\/article\/(.*)\">(.*)<\/a>/",$string, $out, PREG_SET_ORDER); 
foreach($out as $key => $value){ 
$article['title'][] = $out[$key][2]; 
$article['link'][] = "http://www.***.com/learn/article/".$out[$key][1]; 
} 
//根据url获取文章内容 
foreach($article['link'] as $key=>$value){ 
$content_html = getwebcontent($article['link'][$key]); 
preg_match("/<div id=pagenum_0(.*)>[\s|\S]*?<\/div>/",$content_html,$matches); 
$article[content][$key] = $matches[0]; 

} 
//不转码还真不能保存成文件 
foreach($article[title] as $key=>$value){ 
$article[title][$key] = iconv('utf-8', 'gbk', $value);//转码 
} 
//存入文件 
$num = count($article['title']); 
for($i=0; $i<$num; $i++){ 
file_put_contents("{$article[title][$i]}.txt", $article['content'][$i]); 
} 
?> 
©2014-2024 dbsqp.com