之前写过用 curl 批处理采集数据的文章,这里贴上完整版本。代码很简单,废话不多说,直接上代码。我是新手,欢迎指教!
代码只写到获取链接为止。至于排名,返回数组的键(从 0 开始)对应的就是链接的先后顺序,也就是排名。
<?php
/**
 * Based on yahoo access to data
 *
 * Fetches several Yahoo! JAPAN search result pages for a keyword in parallel
 * (curl_multi) and extracts the links found in the result area of each page.
 *
 * @author chujiu <[email protected]>
 * @copyright 2014.04.26 By chujiu
 * @version 0.2.1 2014.04.26
 */

class DataCollectionRank {

    /** Step between the `b` (start offset) values of consecutive requests. */
    const PAGE = 10;

    /** Directory used by _writeFile(); set by exec_curl_get_data(). */
    public $path = '';

    /** Largest `b` offset requested; with PAGE = 10 the offsets are 1, 11, ..., 91. */
    public $main = 91;

    /**
     * Create one curl handle per result page and attach them to a multi handle.
     *
     * @param string $keyword Search keyword.
     * @return array|string '' when the keyword is empty, otherwise
     *                      array('mh' => multi handle,
     *                            'handle' => array(offset => array('ch' => handle, 'url' => string))).
     */
    private function _gather_data($keyword) {
        // The parameter used to be spelled $keyWord while the body read
        // $keyword, so this guard was always true and nothing was ever fetched.
        // A string comparison is used instead of empty() so "0" stays searchable.
        if ($keyword === null || (string) $keyword === '') {
            return '';
        }

        $mh = curl_multi_init();
        // Initialised up front so callers can always iterate 'handle'.
        $chs = array('handle' => array());
        $query = urlencode($keyword);

        for ($i = 1; $i <= $this->main; $i += self::PAGE) {
            $url = 'http://search.yahoo.co.jp/search?p=' . $query
                . '&tid=top_ga1_sa&ei=UTF-8&aq=-1&oq=' . $query
                . '&pstart=1&fr=top_ga1_sa&b=' . $i;

            $ch = curl_init();
            curl_setopt_array($ch, array(
                CURLOPT_URL            => $url,
                CURLOPT_HEADER         => false,
                // NOTE(review): peer verification is disabled; harmless for the
                // plain-http URL above, but revisit if the URL moves to https.
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT        => 30,
                CURLOPT_AUTOREFERER    => true,
            ));

            curl_multi_add_handle($mh, $ch);
            $chs['handle'][$i] = array('ch' => $ch, 'url' => $url);
        }

        $chs['mh'] = $mh;
        return $chs;
    }

    /**
     * Drive a multi handle until every attached transfer has finished.
     *
     * Waits on curl_multi_select() between rounds instead of spinning on
     * curl_multi_exec(), which would burn CPU for the whole transfer time.
     *
     * @param resource|object $mh Multi handle from curl_multi_init().
     * @return void
     */
    private function _exec_multi($mh) {
        $active = null;
        do {
            $status = curl_multi_exec($mh, $active);
        } while ($status === CURLM_CALL_MULTI_PERFORM);

        while ($active && $status === CURLM_OK) {
            if (curl_multi_select($mh) === -1) {
                // select() could not wait; back off briefly rather than busy-loop.
                usleep(100000);
            }
            do {
                $status = curl_multi_exec($mh, $active);
            } while ($status === CURLM_CALL_MULTI_PERFORM);
        }
    }

    /**
     * Fetch all result pages for a keyword and return the extracted links.
     *
     * Transfer errors are appended to "get_rank_log.txt" inside $path.
     *
     * @param string $keyword Search keyword.
     * @param string $path    Directory for the error log.
     * @return array|string '' when the keyword is empty; otherwise a
     *                      zero-indexed list of URL-decoded links in page order
     *                      (empty array when nothing was found).
     */
    public function exec_curl_get_data($keyword, $path) {
        $this->path = $path;

        $chs = $this->_gather_data($keyword);
        if (empty($chs)) {
            return '';
        }

        $mh = $chs['mh'];
        $this->_exec_multi($mh);

        $error = '';
        $links = array();
        foreach ($chs['handle'] as $ch) {
            $message = curl_error($ch['ch']);
            if ($message) {
                // Timestamp is year-month-day; the former 'Y-d-m' swapped day and month.
                $error .= "\n" . 'error提示:' . $message . '-------URL:' . $ch['url']
                    . '--------时间:' . date('Y-m-d H:i:s') . "\n";
            } else {
                $body = curl_multi_getcontent($ch['ch']);
                if (!empty($body)) {
                    foreach ($this->_get_keyword_link_preg($body) as $link) {
                        $links[] = $link;
                    }
                }
            }

            curl_multi_remove_handle($mh, $ch['ch']);
            curl_close($ch['ch']);
        }
        curl_multi_close($mh);

        if ($error !== '') {
            $this->_writeFile('get_rank_log.txt', $error, 'ab+');
        }

        // Links are collected as a list rather than a '|'-joined string, so a
        // decoded URL containing '|' is not split and "no links" yields array()
        // instead of array('').
        return $links;
    }

    /**
     * Extract the href values from the result area of one search page.
     *
     * The result area is the text between '<div id="web">' and
     * '<div id="posS" class="spns">', minus everything from the "pg" or "rel"
     * div onward and minus <em> elements.
     *
     * @param string $str Raw HTML of one result page.
     * @return array List of URL-decoded href values; empty when the page has
     *               no recognisable result area.
     */
    private function _get_keyword_link_preg($str) {
        if (empty($str)) {
            return array();
        }

        $parts = explode('<div id="web">', $str);
        if (!isset($parts[1])) {
            // Marker missing (error page, changed layout): nothing to extract.
            return array();
        }

        $pieces = explode('<div id="posS" class="spns">', $parts[1]);
        $html = $pieces[0];
        $strip = array(
            '#<div id="pg">[\s\S]+#',
            '#<div id="rel">[\s\S]+#',
            '#<em>[\s\S]+?</em>#',
        );
        foreach ($strip as $pattern) {
            $html = preg_replace($pattern, '', $html);
            if (!is_string($html)) {
                // preg_replace() returns null on a PCRE error.
                return array();
            }
        }

        $links = array();
        if (preg_match_all('#href="(.*?)">#', $html, $matches)) {
            foreach ($matches[1] as $href) {
                $links[] = urldecode($href);
            }
        }
        return $links;
    }

    /**
     * Write data to a file inside $this->path.
     *
     * @param string $fileName File name relative to $this->path.
     * @param string $data     Data to write.
     * @param string $method   fopen() mode; with "rb+" the file is truncated
     *                         to the length of $data after writing.
     * @param int    $iflock   Non-zero to take an exclusive lock while writing.
     * @param int    $check    Non-zero to terminate the script with
     *                         "403 Forbidden!" when the path contains "..".
     * @param int    $chmod    Non-zero to chmod the file to 0777 afterwards.
     * @return bool True when the data was written, false when the file could
     *              not be opened or written.
     */
    public function _writeFile($fileName, $data, $method="rb+", $iflock=1, $check=1, $chmod=1){
        $target = $this->path . '/' . $fileName;

        if ($check && strpos($target, '..') !== false) {
            exit('403 Forbidden!');
        }

        // Best effort: make sure the file exists so that "rb+" can open it.
        @touch($target);

        $handle = @fopen($target, $method);
        if ($handle === false) {
            // Previously flock/fwrite/fclose were called on false here.
            return false;
        }

        $locked = $iflock && flock($handle, LOCK_EX);
        $written = fwrite($handle, $data);
        if ($written !== false && $method == "rb+") {
            ftruncate($handle, strlen($data));
        }
        if ($locked) {
            fflush($handle);
            flock($handle, LOCK_UN);
        }
        fclose($handle);

        // NOTE(review): 0777 makes the file world-writable; kept for
        // compatibility, pass $chmod = 0 to skip.
        $chmod && @chmod($target, 0777);

        return $written !== false;
    }
}
?>
<?php
/**
 * Remove duplicate rows from a two-dimensional array and label the first two
 * columns of every remaining row as 'keyword' and 'domain'.
 *
 * Two rows are duplicates when all of their values, compared as strings, are
 * equal. Rows are compared through a serialized signature rather than by
 * joining and re-splitting on ",", so values that contain commas are neither
 * mangled nor confused with other rows.
 *
 * Keys of the result follow the original array_flip(array_flip()) behaviour:
 * each unique row is keyed by the zero-based position of its LAST occurrence,
 * and rows appear in order of their FIRST occurrence.
 *
 * @param array $array List of rows; each row is a list whose first value is
 *                     the keyword and whose second value is the domain.
 * @return array position => array('keyword' => string, 'domain' => string|null);
 *               'domain' is null when the row has no second value.
 */
function array_unique_fb($array)
{
    $lastSeen = array(); // row signature => position of its last occurrence
    $rows = array();     // position => row values cast to strings
    $position = 0;

    foreach ($array as $row) {
        $cells = array_map('strval', array_values((array) $row));
        $rows[$position] = $cells;
        // Re-assigning an existing key keeps its first-seen order but
        // records the latest position.
        $lastSeen[serialize($cells)] = $position;
        $position++;
    }

    $data = array();
    foreach ($lastSeen as $position) {
        $cells = $rows[$position];
        $data[$position] = array(
            'keyword' => isset($cells[0]) ? $cells[0] : '',
            'domain'  => isset($cells[1]) ? $cells[1] : null,
        );
    }

    return $data;
}