db = $db; } function geturlfile($url) { $url = trim($url); $content = ''; if (extension_loaded('curl')) { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_HEADER, 0); $content = curl_exec($ch); curl_close($ch); } else { $content = file_get_contents($url); } return trim($content); } function get_all_url($code) { preg_match_all('/"\\' ]+)["|\\']?\\s*[^>]*>([^>]+)<\\/a>/is', $code, $arr); return array('name' => $arr[2], 'url' => $arr[1]); } function get_sub_content($str, $start, $end) { $start = trim($start); $end = trim($end); if ($start == '' || $end == '') { return $str; } $str = explode($start, $str); $str = explode($end, $str[1]); return $str[0]; } function vd($var) { echo " \\r\\n"; echo ""; }}?> geturlfile($url);//定义采集列表区间$start = '\\r\\n"; var_dump($var); echo "\\r\\n\\r\\n"; echo "';$end = '';//获取区间内的文章URL和TITLE$code = $gather->get_sub_content($html, $start, $end);$newsAry = $gather->get_all_url($code);//打印出结果//$gather->vd($newsAry);$tarGetUrl = $newsAry['url'][0];//获取目标网址HTML$html = $gather->geturlfile($tarGetUrl);//定义采集列表区间$start = '
PHP 文章采集正则代码
/**
 * Download a page with cURL and return its body with surrounding
 * whitespace trimmed.
 *
 * Redirects are followed. Both the connection phase and the whole transfer
 * are time-limited so that a stalled server cannot block the script forever.
 *
 * @param string $url             Address of the page to fetch.
 * @param int    $connectTimeout  Seconds allowed for establishing the connection.
 * @param int    $transferTimeout Seconds allowed for the complete request.
 *
 * @return string The trimmed response body, or an empty string when the
 *                handle cannot be created or the transfer fails.
 */
function getwebcontent($url, $connectTimeout = 10, $transferTimeout = 30)
{
    $ch = curl_init();
    if ($ch === false) {
        // No handle could be created; report "no content" the same way a
        // failed transfer is reported.
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
    // CURLOPT_CONNECTTIMEOUT only covers the connect phase; without an
    // overall limit a server that accepts the connection and then stops
    // sending data would hang the request indefinitely.
    curl_setopt($ch, CURLOPT_TIMEOUT, $transferTimeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

    $response = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on failure; make that case explicit instead
    // of relying on trim() to coerce false into an empty string.
    if ($response === false) {
        return '';
    }

    return trim($response);
}

// Fetch the listing page from which article titles and URLs are extracted.
// NOTE(review): the host in this URL appears to be redacted ('***') — replace
// it with the real site before running.
$string = getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2');

// Match with regular expressions
[\s|\S]*?<\/div>/",$content_html,$matches); $article[content][$key] = $matches[0]; } //不转码还真不能保存成文件 foreach($article[title] as $key=>$value){ $article[title][$key] = iconv('utf-8', 'gbk', $value);//转码 } //存入文件 $num = count($article['title']); for($i=0; $i<$num; $i++){ file_put_contents("{ $article[title][$i]}.txt", $article['content'][$i]); } ?>