Hi!请登录

使用PHP的CURL模拟POST采集

2020-10-31 64 10/31

这个函数很不错,curl函数中的header还可以加入随机UA等功能防屏蔽,以后改改做自己的采集类!

/**
 * QQ group: 223494678
 *
 * Fetches one page of the paged listing at default.aspx with cURL.
 *
 * When $EVENTARGUMENT is empty, the page is requested with a plain GET
 * (initial load). Otherwise a form postback is simulated by POSTing the
 * hidden state fields together with the fixed search-form values below.
 *
 * @param string $EVENTARGUMENT   Value sent as __EVENTARGUMENT (getHtml() passes the
 *                                page number); an empty value requests the first page.
 * @param string $VIEWSTATE       Value sent as __VIEWSTATE, taken from the previous response.
 * @param string $EVENTVALIDATION Value sent as __EVENTVALIDATION, taken from the previous response.
 * @param string $EVENTTARGET     Value sent as __EVENTTARGET; defaults to "pager".
 *
 * @return string|false Raw response (response headers followed by the body, because
 *                      CURLOPT_HEADER is on), or false when curl_exec() fails
 *                      (including HTTP status >= 400, because CURLOPT_FAILONERROR is on).
 */
function getn($EVENTARGUMENT = "", $VIEWSTATE = "", $EVENTVALIDATION = "", $EVENTTARGET = "pager"){
    $args = array();
    if($EVENTARGUMENT){
        $args = array(
            '__EVENTTARGET'=>$EVENTTARGET,
            '__EVENTARGUMENT'=>$EVENTARGUMENT,
            '__VIEWSTATE'=>$VIEWSTATE,
            '__EVENTVALIDATION'=>$EVENTVALIDATION,
            '__VIEWSTATEENCRYPTED'=>'',
            // Fixed: the key previously carried a stray trailing "=" and so did not
            // match the naming of the other search$... fields.
            'search$txtFundName'=>'',
            'search$txtFundManger'=>'',
            'search$ddlFoundationDateOperater'=>'1',
            'search$txtFoundationDate'=>'',
            'search$dltFundType$ctl01$chkFundType'=>'on',
            'search$dltFundType$ctl01$chklFundChildType$0'=>'on',
            'search$dltFundType$ctl01$chklFundChildType$1'=>'on',
            'search$dltFundType$ctl01$chklFundChildType$2'=>'on',
            'search$dltFundType$ctl01$chklFundChildType$3'=>'on',
            'search$dltFundType$ctl01$chklFundChildType$4'=>'on',
            'search$chklFundStatus$0'=>'on',
            'search$ddlFundOrg'=>'0',
            'search$txtFundOrgName'=>'',
            'search$ddlStatisticDateOperater'=>'1',
            'search$txtStatisticDate'=>'',
            'search$radlStatisticMode'=>'1'
        );
    }
    $user_agent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11";
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, 'http://???/default.aspx');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the response instead of printing it
    curl_setopt($ch, CURLOPT_VERBOSE, TRUE);     // NOTE(review): emits cURL debug output; presumably a debugging leftover
    curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
    curl_setopt($ch, CURLOPT_FAILONERROR, TRUE); // HTTP status >= 400 makes curl_exec() return false
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($ch, CURLOPT_HEADER, TRUE);      // response headers are included in the returned string
    curl_setopt($ch, CURLINFO_HEADER_OUT, TRUE); // record the outgoing request headers on the handle
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
    'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language:zh-CN,zh;q=0.8',
    'Connection: Keep-Alive',
    'Cache-Control:max-age=0',
    'Referer:http://???/default.aspx',
    'Expect:' // empty value suppresses the "Expect: 100-continue" header on POST
    ));
    if($args){
        // Only switch to POST when there is form data to send; the initial page
        // load (no arguments) is an ordinary GET rather than an empty POST.
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $args); // array payload is sent as multipart/form-data
    }
    curl_setopt($ch, CURLOPT_USERAGENT, $user_agent); // User-Agent request header
    $document = curl_exec($ch);
    return $document;
}
/**
 * QQ group: 223494678
 *
 * Scraper entry point: collects the "dltData" table markup from every page of
 * the listing and appends it to the global $html.
 *
 * The first page is fetched with getn() without arguments; the total page
 * count is read from it. Each following page is requested by posting the page
 * number together with the __VIEWSTATE / __EVENTVALIDATION values extracted
 * from the previous response, pausing one second between requests.
 *
 * Collection stops early when a request fails or when a response carries no
 * __VIEWSTATE, since the next postback could not be built from it.
 *
 * @return string The accumulated table markup (also left in the global $html).
 */
function getHtml(){
    global $html;
    if($html === null){
        // Keep the documented string return type even when nothing is collected.
        $html = '';
    }
    $tablePattern = '/<table id="dltData".+?<!-- AspNetPager/is';
    $viewStatePattern = '/<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.+?)"/is';
    $validationPattern = '/<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.+?)"/is';

    $total = 1; // replaced by the real page count once page 1 has been parsed
    $VIEWSTATE = "";
    $EVENTVALIDATION = "";
    for($i = 1; $i <= $total; $i++){
        if($i === 1){
            $document = getn();
        }else{
            sleep(1); // throttle consecutive requests
            $document = getn($i, $VIEWSTATE, $EVENTVALIDATION);
        }
        if(!is_string($document) || $document === ''){
            // Request failed (getn() returned false) or the response was empty.
            break;
        }

        if($i === 1 && preg_match('/<font color="black"><b>(\d+?)<\/b><\/font> 页<\/span>/is', $document, $matches)){
            $total = (int)$matches[1];
        }

        // Append the data table, dropping the trailing pager marker used as the end anchor.
        if(preg_match($tablePattern, $document, $matches)){
            $html .= str_replace('<!-- AspNetPager', '', $matches[0]);
        }

        // State fields required to request the next page.
        $VIEWSTATE = preg_match($viewStatePattern, $document, $matches) ? $matches[1] : "";
        $EVENTVALIDATION = preg_match($validationPattern, $document, $matches) ? $matches[1] : "";
        if($VIEWSTATE === ""){
            // Without the state field the next postback cannot be constructed.
            break;
        }
    }
    return $html;
}

说明:
getHtml 是采集的入口函数:先抓取第一页的数据,并从中提取总页数以及 __VIEWSTATE、__EVENTVALIDATION 等字段,然后循环采集后面各页的数据;getn 是实际发起请求的采集函数,主要通过 CURL 模拟 POST 提交。

相关推荐