注意事項
二進制傳輸時,首字母不能為符號[@],否則發送不出去。環境復現【PHP 5.4】
新版
<?php
/**
* Created by PhpStorm.
* User: aoshi
* Date: 2020/12/28
* Time: 15:18
*/
namespace Cron\Controller;
class CrawltestController extends BaseController
{
protected $cookie = array();
protected $referer = '';
/**
 * Log in to the target site, then crawl the first ten list pages.
 *
 * Flow:
 *  1. GET the login URL so the server can issue its cookies.
 *  2. POST the credentials to the same URL, replaying the collected cookies.
 *  3. Request the list pages at offsets 0, 100, ..., 900, using the previously
 *     requested page (or $referBase for the first one) as the Referer, and pass
 *     each response body to explanHtml().
 *  4. Dump the collected cookies and terminate the script.
 *
 * @return void The method never returns normally; it always ends with exit.
 */
public function login() {
    $url = '********';
    $agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36';

    // Step 1: plain GET to obtain the initial cookies.
    $res = $this->curlCore($url,'get',array('agent'=>$agent));
    if(!$res) {
        exit('this is over width first curl error');
    }
    // Called for its side effect: Set-Cookie values are appended to $this->cookie.
    $this->explainHeader($res['response_header']);

    // Step 2: submit the login form.
    $data = array(
        'account'=>'****.hz.cn',
        'password'=>'*****',
        'Sumit'=>'submit'
    );
    $params = array(
        'content_type'=>'urlencode',
        'data'=>$data,
        'referer'=>$url,
        'agent'=>$agent,
        'headers'=>$this->getHeader('post'),
    );
    $resSecond = $this->curlCore($url,'post',$params);
    // The result of the POST itself must be checked here; testing $res again
    // would let a failed login request go unnoticed.
    if(!$resSecond) {
        exit('this is over width second curl error');
    }
    $this->explainHeader($resSecond['response_header']);

    // Step 3: walk the list pages.
    $baseUrl = '********';
    $referBase = '*******';
    $listUrl = '';
    for($i = 0;$i<10;$i++) {
        // The page requested in the previous iteration becomes the Referer.
        $this->referer = $listUrl ? $listUrl : $referBase;
        $listUrl = $i ? $baseUrl . '?offset=' . ($i * 100) : $baseUrl;
        $params = array(
            'referer'=>$this->referer,
            'agent'=>$agent,
            'headers'=>$this->getHeader(),
        );
        $listRes = $this->curlCore($listUrl,'get',$params);
        if($listRes) {
            // curlCore() returns an array; only the body is HTML to be parsed.
            $this->explanHtml($listRes['response_body']);
        } else {
            echo 'list request failed: ' . $listUrl . PHP_EOL;
        }
        sleep(2); // throttle between page requests
    }

    var_export($this->cookie);
    exit;
}
/**
 * Build the header list passed to cURL's CURLOPT_HTTPHEADER option.
 *
 * The cookies collected in $this->cookie are collapsed into a single Cookie
 * header. Only the leading "name=value" segment of every stored entry is used,
 * so Set-Cookie attributes (expires, path, HttpOnly, ...) are never echoed
 * back; when the same cookie name occurs more than once the last value wins.
 * No Cookie header is emitted when nothing has been collected.
 *
 * @param string $method HTTP method, case-insensitive. 'post' selects a
 *                       form-urlencoded Content-type; any other value selects
 *                       JSON Content-type and Accept headers.
 * @return array List of "Name: value" header lines.
 */
public function getHeader($method = 'get') {
    $method = strtoupper($method);
    $headersMap = array();

    $cookiePairs = array();
    foreach((array) $this->cookie as $rawCookie) {
        $segments = explode(';', (string) $rawCookie);
        $pair = trim($segments[0]);
        if($pair === '') {
            continue;
        }
        $eqPos = strpos($pair, '=');
        $cookieName = ($eqPos === false) ? $pair : substr($pair, 0, $eqPos);
        $cookiePairs[$cookieName] = $pair;
    }
    if($cookiePairs) {
        $headersMap['Cookie'] = implode('; ', $cookiePairs);
    }

    if($method == 'POST') {
        $headersMap['Content-type'] = 'application/x-www-form-urlencoded';
    } else {
        // The value must not repeat the header name.
        $headersMap['Content-type'] = 'application/json;charset=utf-8';
        $headersMap['Accept'] = 'application/json';
    }

    $headers = array();
    foreach($headersMap as $headerKey => $headerVal) {
        // No whitespace is allowed between the field name and the colon.
        $headers[] = $headerKey . ': ' . $headerVal;
    }
    return $headers;
}
/**
 * Perform one HTTP request with cURL and return the response split into
 * header and body, together with the request header and timing information.
 *
 * Recognised $params keys (all optional):
 *  - content_type  'urlencode' or 'json': how an array 'data' is serialised.
 *                  With any other value the array is handed to cURL unchanged,
 *                  which sends it as multipart/form-data. On PHP < 5.5 a value
 *                  starting with '@' is then treated as a file upload.
 *  - data          request body: array (see content_type) or scalar (sent as is)
 *  - referer       value for the Referer header
 *  - agent         value for the User-Agent header
 *  - headers       list of header lines for CURLOPT_HTTPHEADER
 *  - time_out      total timeout in seconds ('timeout' is accepted as an alias);
 *                  defaults to 10
 *  - ssl           when truthy, peer and host verification are switched on
 *  - cacert_pem    CA bundle file (used only together with 'ssl')
 *  - cacert_path   CA directory (used only together with 'ssl')
 *  - verbose       set to false to silence cURL's verbose trace; defaults to true
 *
 * @param string $url    Request URL.
 * @param string $method HTTP method, case-insensitive. GET, POST, PUT and DELETE
 *                       are handled; any other value leaves cURL's default method.
 * @param array  $params Options described above.
 * @return array|false   false when curl_exec() fails (details in $this->error);
 *                       otherwise an array with the keys response_header,
 *                       response_body, requestHeader, connectTime,
 *                       preTransferTime, startTransferTime, redirectTime,
 *                       totalTime and responseContentType.
 */
public function curlCore($url,$method = 'GET',$params = array()) {
    $method = strtoupper($method);
    $params = is_array($params) ? $params : array();

    $timeOut = 10;
    if(!empty($params['time_out'])) {
        $timeOut = $params['time_out'];
    } elseif(!empty($params['timeout'])) {
        $timeOut = $params['timeout'];
    }

    // Serialise the request body up front; null means "no body supplied".
    $body = null;
    if(isset($params['data'])) {
        if(is_array($params['data'])) {
            $contentType = isset($params['content_type']) ? $params['content_type'] : '';
            switch($contentType) {
                case 'urlencode': // key1=val1&key2=val2
                    $body = http_build_query($params['data']);
                    break;
                case 'json':
                    $body = json_encode($params['data']);
                    break;
                default: // array is passed through; cURL sends multipart/form-data
                    $body = $params['data'];
                    break;
            }
        } elseif(is_scalar($params['data'])) {
            $body = (string) $params['data'];
        }
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);

    if(!empty($params['ssl'])) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        // 2 = certificate name must match the host; the value 1 (what `true`
        // is cast to) is not a supported setting.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        if(!empty($params['cacert_pem'])) {
            curl_setopt($ch, CURLOPT_CAINFO, $params['cacert_pem']);
        }
        if(!empty($params['cacert_path'])) {
            curl_setopt($ch, CURLOPT_CAPATH, $params['cacert_path']);
        }
    }

    switch($method) {
        case 'GET':
            curl_setopt($ch, CURLOPT_HTTPGET, true);
            break;
        case 'POST':
        case 'PUT':
        case 'DELETE':
            // NOTE(review): a custom request method is reused by libcurl for
            // every hop of a redirect chain - confirm that is wanted for POST.
            curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $body === null ? '' : $body);
            break;
    }

    // Redirect handling: follow up to 3 redirects and fill in Referer automatically.
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 3);
    curl_setopt($ch, CURLOPT_UNRESTRICTED_AUTH, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

    // Response headers are included in the returned stream and split off below;
    // the outgoing request header is tracked so it can be read via curl_getinfo().
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLINFO_HEADER_OUT, true);
    curl_setopt($ch, CURLOPT_NOBODY, false);

    if(!empty($params['referer'])) {
        curl_setopt($ch, CURLOPT_REFERER, $params['referer']);
    }
    if(!empty($params['agent'])) {
        curl_setopt($ch, CURLOPT_USERAGENT, $params['agent']);
    }
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeOut);
    if(!empty($params['headers'])) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $params['headers']);
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    $verbose = isset($params['verbose']) ? (bool) $params['verbose'] : true;
    curl_setopt($ch, CURLOPT_VERBOSE, $verbose); // debugging aid

    $responseStream = curl_exec($ch);
    if($responseStream === false) {
        // Record the real transport error before the handle is released.
        $this->error = 'cURL error (' . curl_errno($ch) . '): ' . curl_error($ch);
        curl_close($ch);
        return false;
    }

    $responseHeaderSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    $requestHeader = curl_getinfo($ch, CURLINFO_HEADER_OUT);
    $connectTime = curl_getinfo($ch, CURLINFO_CONNECT_TIME);
    $preTransferTime = curl_getinfo($ch, CURLINFO_PRETRANSFER_TIME);
    $startTransferTime = curl_getinfo($ch, CURLINFO_STARTTRANSFER_TIME);
    $redirectTime = curl_getinfo($ch, CURLINFO_REDIRECT_TIME);
    $totalTime = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
    $responseContentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    $responseHeader = substr($responseStream, 0, $responseHeaderSize);
    // substr() may return false for an empty body; the cast turns that into ''
    // without clobbering a body that is literally "0".
    $responseBody = (string) substr($responseStream, $responseHeaderSize);

    return array(
        'response_header'=>$responseHeader,
        'response_body'=>$responseBody,
        'requestHeader'=>$requestHeader,
        'connectTime'=>$connectTime,
        'preTransferTime'=>$preTransferTime,
        'startTransferTime'=>$startTransferTime,
        'redirectTime'=>$redirectTime,
        'totalTime'=>$totalTime,
        'responseContentType'=>$responseContentType,
    );
}
/**
 * Send a GET or form POST request, dump the request and response headers, and
 * feed the response header to explainHeader() so cookies are collected.
 *
 * @param string $url        Request URL.
 * @param string $referUrl   Referer header value; skipped when empty.
 * @param array  $headers    Header lines for CURLOPT_HTTPHEADER; skipped when empty.
 * @param int    $requestTyp 2 sends $data as a urlencoded POST; any other value
 *                           leaves the request as a GET.
 * @param array  $data       Form fields, used only when $requestTyp is 2.
 * @return string Raw response: header block followed by the body. The script
 *                is terminated via exit when the transfer fails.
 */
protected function curlRequest($url,$referUrl,$headers,$requestTyp = 1,$data = array()) {
    $agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);       // response header is part of the returned stream
    curl_setopt($ch, CURLINFO_HEADER_OUT, true);  // keep the request header for curl_getinfo()
    if($referUrl){
        curl_setopt($ch, CURLOPT_REFERER, $referUrl);
    }
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    if($headers) {
        curl_setopt($ch, CURLOPT_HTTPHEADER,$headers);
    }
    if($requestTyp == 2) {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data));
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    $return_str = curl_exec($ch);
    $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    $requestStr = curl_getinfo($ch, CURLINFO_HEADER_OUT);
    curl_close($ch); // the handle is released exactly once

    if($return_str === false) {
        exit('error with stop');
    }

    $header = substr($return_str, 0, $header_size);
    var_export($requestStr);
    echo PHP_EOL;
    var_export($header);
    echo PHP_EOL;
    // explainHeader() takes the header text only; passing the URL first would
    // make it parse the URL instead of the response header.
    $this->explainHeader($header);

    return $return_str;
}
/**
 * Send a GET or form POST request, dump the request header, and feed the
 * response header to explainHeader() so cookies are collected.
 *
 * @param string $url        Request URL.
 * @param string $referUrl   Referer header value; skipped when empty.
 * @param array  $headers    Header lines for CURLOPT_HTTPHEADER; skipped when empty.
 * @param int    $requestTyp 2 sends $data as a urlencoded POST; any other value
 *                           leaves the request as a GET.
 * @param array  $data       Form fields, used only when $requestTyp is 2.
 * @return string Raw response: header block followed by the body. The script
 *                is terminated via exit when the transfer fails.
 */
protected function curlRequestNew($url,$referUrl,$headers,$requestTyp = 1,$data = array()) {
    $agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);       // response header is part of the returned stream
    curl_setopt($ch, CURLINFO_HEADER_OUT, true);  // keep the request header for curl_getinfo()
    if($referUrl){
        curl_setopt($ch, CURLOPT_REFERER, $referUrl);
    }
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    if($headers) {
        curl_setopt($ch, CURLOPT_HTTPHEADER,$headers);
    }
    if($requestTyp == 2) {
        // Regular form POST (application/x-www-form-urlencoded).
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data));
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    $return_str = curl_exec($ch);
    $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    $requestStr = curl_getinfo($ch, CURLINFO_HEADER_OUT);
    curl_close($ch); // the handle is released exactly once

    if($return_str === false) {
        exit('error with stop');
    }

    $header = substr($return_str, 0, $header_size);
    var_export($requestStr);
    echo PHP_EOL;
    // explainHeader() takes the header text only; passing the URL first would
    // make it parse the URL instead of the response header.
    $this->explainHeader($header);

    return $return_str;
}
/**
 * Parse a raw response header block into a map and collect its cookies.
 *
 * Header names are lower-cased. A header that occurs several times is returned
 * as a flat list with the most recent value first. Every Set-Cookie value is
 * cut off at its "path=/" attribute (matched case-insensitively) and appended
 * to $this->cookie. Lines without a colon, such as the status line, are ignored.
 *
 * @param string $header Response header text with CRLF line endings.
 * @return array Map of lower-cased header name => string or list of strings.
 *               When Set-Cookie was present, 'set-cookie' holds the complete
 *               $this->cookie list gathered so far, not only this response's.
 */
public function explainHeader($header) {
    $headArr = explode("\r\n",$header);
    var_export($headArr); // debug dump of the raw header lines
    $map = array();
    foreach($headArr as $val) {
        $pos = strpos($val,':');
        if(!$pos) { // no colon, or an empty header name
            continue;
        }
        $mapKey = strtolower(trim(substr($val,0,$pos)));
        $mapValue = trim(substr($val,($pos+1)));

        if($mapKey == 'set-cookie') {
            // Drop the path restriction and everything after it.
            $subLength = stripos($mapValue,'path=/');
            if($subLength) {
                $mapValue = substr($mapValue,0,$subLength);
            }
            $this->cookie[] = $mapValue;
        }

        $mapValue = trim($mapValue);
        if(!isset($map[$mapKey])) {
            $map[$mapKey] = $mapValue;
        } elseif(is_array($map[$mapKey])) {
            // Third and later occurrences: keep the list flat, newest first.
            array_unshift($map[$mapKey], $mapValue);
        } else {
            $map[$mapKey] = array($mapValue, $map[$mapKey]);
        }
    }
    if(!empty($map['set-cookie'])) {
        $map['set-cookie'] = $this->cookie;
    }
    return $map;
}
/**
 * Fetch ten list pages (offsets 0, 100, ..., 900) and pass each one to
 * explanHtml(). The page requested in the previous round is sent as the
 * Referer; the first request uses $referBase. Two seconds are slept after
 * every page, and the script terminates when the loop is done.
 *
 * @return void The method never returns normally; it ends with exit.
 */
public function getList() {
    $baseUrl = '*************';
    $referBase = '***********************';
    $url = '';
    $page = 0;
    while($page < 10) {
        $referUrl = $url ? $url : $referBase;
        $url = $page ? $baseUrl . '?offset=' . ($page * 100) : $baseUrl;
        $this->explanHtml($this->curlGet($url, $referUrl));
        sleep(2);
        $page++;
    }
    exit('this is over');
}
/**
 * Issue a GET request and return the result of curl_exec().
 *
 * The request carries fixed JSON Content-type/Accept headers, a hard-coded
 * session cookie and a fixed browser User-Agent.
 *
 * NOTE(review): TLS peer and host verification are switched off, and the
 * session cookie is embedded in source - both should be confirmed as intended.
 *
 * @param string $url      Request URL.
 * @param string $referUrl Referer header value; not sent when empty.
 * @return string|false Response body, or false when the transfer fails.
 */
protected function curlGet($url,$referUrl) {
    $options = array(
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_SSL_VERIFYHOST => FALSE,
        CURLOPT_URL => $url,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        CURLOPT_TIMEOUT => 10, // upper bound so a stalled transfer cannot hang the crawl
        CURLOPT_HTTPHEADER => array(
            "Content-type:application/json;charset=utf-8",
            "Accept:application/json",
            "Cookie: ASPSESSIONIDACQSQDSB=GHGNMIJACHNKAHHMJLEGMIDO; ASPSESSIONIDCAQRRDSA=KCDNMALAGDKKMLKLIFECCION; User=UserLocation=HANGZHOU&UserNameEN=Coco+Cao&Logintime=28&UserDept=SALESHEAD&AccountName=cococao%2Ehz%2Ecn"
        ),
        CURLOPT_RETURNTRANSFER => true,
    );
    if($referUrl) {
        $options[CURLOPT_REFERER] = $referUrl;
    }

    $handle = curl_init();
    foreach($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }
    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
/**
 * Extract the rows of a list page and store each one through the
 * 'PccAsicLlc' model.
 *
 * Every matched table yields link, en_name, zh_name, nature, sales and
 * supervisor (regex groups 1-6); content is stored as an empty string.
 * A failed insert is reported on output and does not stop the remaining rows.
 *
 * @param string|array|false $htmlStr Page HTML. The result array of curlCore()
 *                                    is accepted too (its 'response_body' is
 *                                    used); anything else that is not a
 *                                    non-empty string is ignored.
 * @return void
 */
protected function explanHtml($htmlStr){
    if(is_array($htmlStr)) {
        $htmlStr = isset($htmlStr['response_body']) ? $htmlStr['response_body'] : '';
    }
    if(!is_string($htmlStr) || $htmlStr === '') {
        return; // failed or empty download: nothing to parse
    }

    $matched = preg_match_all( '/<table[^>]*?>\s*?<tr>\s*?<td[^>]*?><a href="(.*?)" class="style10">(.*?)<\/a><\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<\/tr>\s*?<\/table>/i' , $htmlStr , $results );
    if(!$matched) {
        return; // no rows on this page, or the regex engine failed
    }

    foreach($results[1] as $key => $val) {
        $saveData = array(
            'link' => $val,
            'en_name' => $results[2][$key],
            'zh_name' => $results[3][$key],
            'nature' => $results[4][$key],
            'sales' => $results[5][$key],
            'supervisor' => $results[6][$key],
            'content' => '',
        );
        // The debugging dump-and-exit that used to sit here made the insert
        // below unreachable and stopped the crawl after the first row.
        try{
            // Per the original note, add() raises directly when debug mode is on.
            $res = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->add($saveData);
            if(!$res) {
                throw new \Exception($saveData['zh_name']);
            }
        } catch (\Exception $e) {
            echo 'Caught exception: ' . $e->getMessage() . PHP_EOL;
        }
    }
}
/**
 * Download the detail page of every selected record and store its HTML in the
 * record's 'content' field.
 *
 * The Referer imitates the list page the record would appear on:
 * $referBase plus '?offset=' . floor(id / 100) * 100 when that offset is non-zero.
 * A one-second pause is taken whenever the row index is a multiple of 100.
 *
 * NOTE(review): the query is limited by where('id = 1'), which looks like a
 * debugging filter - confirm before running against the full table.
 *
 * @return void The method never returns normally; it ends with exit.
 */
public function getInfo() {
    $baseUrl = '***********************';
    $referBase = '****************************';
    $lists = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->field('id,link')->where('id = 1')->select();
    if(!is_array($lists)) {
        $lists = array(); // empty or failed query: nothing to iterate
    }
    foreach($lists as $key => $val) {
        $url = $baseUrl . $val['link'];
        $offset = floor($val['id'] / 100) * 100;
        $referUrl = $offset ? $referBase . '?offset=' . $offset : $referBase;

        $html = $this->curlGet($url,$referUrl);
        if($html === false) {
            // Keep the stored content instead of overwriting it with a failed download.
            echo 'fetch failed: ' . $url . PHP_EOL;
        } else {
            M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->save(array('id'=>$val['id'],'content'=>$html));
        }

        if($key % 100 == 0) {
            sleep(1);
        }
    }
    exit('this is over');
}
/**
 * Report gaps in the id sequence: look up ids 1 through 999 one by one and
 * print, one per line, every id for which find() returns nothing. The script
 * terminates afterwards.
 *
 * @return void The method never returns normally; it ends with exit.
 */
public function testId() {
    foreach(range(1, 999) as $id) {
        $row = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->find($id);
        if($row) {
            continue;
        }
        echo $id . PHP_EOL;
    }
    exit('this is over');
}
}
<?php
/**
* Created by PhpStorm.
* User: aoshi
* Date: 2020/12/28
* Time: 15:18
*/
namespace Cron\Controller;
class CrawltestController extends BaseController
{
/**
 * Fetch ten list pages (offsets 0, 100, ..., 900) and pass each one to
 * explanHtml(). The page requested in the previous round is sent as the
 * Referer; the first request uses $referBase. Two seconds are slept after
 * every page, and the script terminates when the loop is done.
 *
 * @return void The method never returns normally; it ends with exit.
 */
public function login() {
    $baseUrl = '******************************';
    // The literal previously opened with two quotes, which is a parse error.
    $referBase = '******************************';
    // Initialised so the first iteration does not read an undefined variable.
    $url = '';
    for($i = 0;$i<10;$i++) {
        $referUrl = $url ? $url : $referBase;
        $url = $i ? $baseUrl . '?offset=' . ($i * 100) : $baseUrl;
        $htmlStr = $this->curlGet($url,$referUrl);
        $this->explanHtml($htmlStr);
        sleep(2); // throttle between page requests
    }
    exit('this is over');
}
/**
 * Issue a GET request and return the result of curl_exec().
 *
 * The request carries fixed JSON Content-type/Accept headers, a hard-coded
 * session cookie and a fixed browser User-Agent.
 *
 * NOTE(review): TLS peer and host verification are switched off, and the
 * session cookie is embedded in source - both should be confirmed as intended.
 *
 * @param string $url      Request URL.
 * @param string $referUrl Referer header value; not sent when empty.
 * @return string|false Response body, or false when the transfer fails.
 */
protected function curlGet($url,$referUrl) {
    $options = array(
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_SSL_VERIFYHOST => FALSE,
        CURLOPT_URL => $url,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        CURLOPT_TIMEOUT => 10, // upper bound so a stalled transfer cannot hang the crawl
        CURLOPT_HTTPHEADER => array(
            "Content-type:application/json;charset=utf-8",
            "Accept:application/json",
            "Cookie: ASPSESSIONIDACQSQDSB=GHGNMIJACHNKAHHMJLEGMIDO; ASPSESSIONIDCAQRRDSA=KCDNMALAGDKKMLKLIFECCION; User=UserLocation=HANGZHOU&UserNameEN=Coco+Cao&Logintime=28&UserDept=SALESHEAD&AccountName=cococao%2Ehz%2Ecn"
        ),
        CURLOPT_RETURNTRANSFER => true,
    );
    if($referUrl) {
        $options[CURLOPT_REFERER] = $referUrl;
    }

    $handle = curl_init();
    foreach($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }
    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
/**
 * Extract the rows of a list page and store each one through the
 * 'PccAsicLlc' model.
 *
 * Every matched table yields link, en_name, zh_name, nature, sales and
 * supervisor (regex groups 1-6); content is stored as an empty string.
 * A failed insert is reported on output and does not stop the remaining rows.
 *
 * @param string $htmlStr Page HTML.
 * @return void
 */
protected function explanHtml($htmlStr){
    $rowPattern = '/<table[^>]*?>\s*?<tr>\s*?<td[^>]*?><a href="(.*?)" class="style10">(.*?)<\/a><\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<td[^>]*?>(.*?)<\/td>\s*?<\/tr>\s*?<\/table>/i';
    preg_match_all($rowPattern, $htmlStr, $results);

    foreach($results[1] as $idx => $link) {
        $saveData = array(
            'link' => $link,
            'en_name' => $results[2][$idx],
            'zh_name' => $results[3][$idx],
            'nature' => $results[4][$idx],
            'sales' => $results[5][$idx],
            'supervisor' => $results[6][$idx],
            'content' => '',
        );
        try {
            // Per the original note, add() raises directly when debug mode is on.
            $inserted = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->add($saveData);
            if(!$inserted) {
                throw new \Exception($saveData['zh_name']);
            }
        } catch (\Exception $e) {
            echo 'Caught exception: ' . $e->getMessage() . PHP_EOL;
        }
    }
}
/**
 * Download the detail page of every selected record and store its HTML in the
 * record's 'content' field.
 *
 * The Referer imitates the list page the record would appear on:
 * $referBase plus '?offset=' . floor(id / 100) * 100 when that offset is non-zero.
 * A one-second pause is taken whenever the row index is a multiple of 100.
 *
 * NOTE(review): the query is limited by where('id = 1'), which looks like a
 * debugging filter - confirm before running against the full table.
 *
 * @return void The method never returns normally; it ends with exit.
 */
public function getInfo() {
    $baseUrl = '******************************/';
    $referBase = '******************************';
    $lists = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->field('id,link')->where('id = 1')->select();
    if(!is_array($lists)) {
        $lists = array(); // empty or failed query: nothing to iterate
    }
    foreach($lists as $key => $val) {
        $url = $baseUrl . $val['link'];
        $offset = floor($val['id'] / 100) * 100;
        $referUrl = $offset ? $referBase . '?offset=' . $offset : $referBase;

        $html = $this->curlGet($url,$referUrl);
        if($html === false) {
            // Keep the stored content instead of overwriting it with a failed download.
            echo 'fetch failed: ' . $url . PHP_EOL;
        } else {
            M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->save(array('id'=>$val['id'],'content'=>$html));
        }

        if($key % 100 == 0) {
            sleep(1);
        }
    }
    exit('this is over');
}
/**
 * Report gaps in the id sequence: look up ids 1 through 999 one by one and
 * print, one per line, every id for which find() returns nothing. The script
 * terminates afterwards.
 *
 * @return void The method never returns normally; it ends with exit.
 */
public function testId() {
    foreach(range(1, 999) as $id) {
        $row = M('PccAsicLlc','wx_',C('ARTICLE_DSN'))->find($id);
        if($row) {
            continue;
        }
        echo $id . PHP_EOL;
    }
    exit('this is over');
}
}