标题:php抓取百度推广中url自动保存在txt文件中代码

-------------------------------------------------------------------------------------------------------------------------------

时间:2013/8/6 20:58:04

-------------------------------------------------------------------------------------------------------------------------------

内容:

<?php

/*
 * Scrapes the Baidu result page of each keyword, pulls the display URL out of
 * every promoted (ad) entry and appends it to semallurl.txt, echoing the same
 * data as HTML. Page download is delegated to fetch(), declared in this file.
 */
$fp = fopen("semallurl.txt", "a+");
if ($fp === false) {
  /* Every fwrite() below needs a valid handle; without it nothing can be logged. */
  exit("Cannot open semallurl.txt for appending");
}
/* NOTE(review): the last keyword repeats the third one, so that page is scraped twice - confirm whether another city was intended. */
$kws1="上海酒店,北京酒店,广州酒店,天津酒店,广州酒店";
$kws=explode(",",$kws1);
foreach ($kws as $kw){
  $keywords=$kw;
  $enkeywords=urlencode($keywords);
  $pageURL="http://www.baidu.com/s?word=$enkeywords";
  $contents=fetch($pageURL); /* download the result page */
  if (!is_string($contents)) {
    /* fetch() returns curl_exec()'s result, which is false when the transfer fails. */
    $contents="";
  }
  /* Strip JavaScript. "s" lets "." cross newlines so multi-line scripts are removed too; "i" also catches <SCRIPT>. */
  $stripped=preg_replace('/<script[^>]*?>.*?<\/script>/is', "", $contents);
  if (is_string($stripped)) {
    /* preg_replace() yields null on a PCRE error; keep the unstripped page in that case. */
    $contents=$stripped;
  }
  /* The page is cut at each '<div id="content_' marker; the first piece after a marker is
     treated as the right-hand column and the second as the left-hand column. A page
     without those markers simply produces columns with no ads. */
  $contentsbytwoside=explode('<div id="content_',$contents);
  $contents_right='<div id="content_'.(isset($contentsbytwoside[1]) ? $contentsbytwoside[1] : "");
  $contents_left='<div id="content_'.(isset($contentsbytwoside[2]) ? $contentsbytwoside[2] : "");
  preg_match_all('/(<div id=\"[0-9]*\" class=\"ec_pp_f ec_pp_top.*?)<a href=\"http:\/\/e\.baidu\.com\//',$contents_left,$ads_left_white);
  preg_match_all('/(<table class=\"EC_mr15 EC_ppim_top ec_pp_f.*?<\/table>)/',$contents_left,$ads_left_green);
  preg_match_all('(<div id=\"bdfs[^>]*class=\"EC_im EC_fr EC_PP  EC_idea1017 \">.*?<a class=\"EC_BL EC_desc\".*?<\/a>)',$contents_right,$ads_right);
  echo "------------Keywords ads for".$kw."start ------------------------------------<br>" ;
  fwrite($fp,  "----------".$kw . " ads start------------------------- \r\n");

  /* One row per ad group: on-screen heading, preg_match_all() result, pattern that isolates the display URL. */
  $sections=array(
    array("left ads with green background is<br>", $ads_left_green, '/<span>.*?<\/span>/'),
    array("left ads with white background is<br>", $ads_left_white, '/<span class=\"ec_url\">.*?<\/span>/'),
    array("right ads with is<br>", $ads_right, '/(<font size=\"-1\" class=\"EC_url\">.*?<\/font>)/'),
  );
  foreach ($sections as $index => $section) {
    if ($index > 0) {
      echo "<p>-------------<br>" ;
    }
    echo $section[0];
    /* Index 0 holds the full matches; it is absent if preg_match_all() failed. */
    $adBlocks=isset($section[1][0]) ? $section[1][0] : array();
    foreach ($adBlocks as $adHtml) {
      if (preg_match($section[2], $adHtml, $urlMatch) !== 1) {
        /* Ad block without a recognisable URL element: skip it rather than log an empty line. */
        continue;
      }
      fwrite($fp, strip_tags($urlMatch[0]) . "\r\n");
      echo $urlMatch[0]."<br>";
    }
  }

  echo "---------------Keywords ads for".$kw."END ------------------------------------<br>" ;
  fwrite($fp, "----------".$kw . " ads End------------------------- \r\n");
}

/* Timestamp marking the end of this run. */
fwrite($fp, date("Y-m-d H:i:s") . " PHP代码自动运行!\r\n");

fclose($fp);

/**
 * Downloads a URL with cURL and returns the response body.
 *
 * @param string $Date URL to request (despite its name, this is not a date).
 * @return string|false Response body, or false when the cURL handle cannot be
 *                      created or the transfer fails.
 */
function fetch($Date){
  $ch = curl_init();
  if ($ch === false) {
    return false;
  }
  $timeout = 5;
  curl_setopt($ch, CURLOPT_URL, "$Date");
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
  curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
  /* CURLOPT_CONNECTTIMEOUT bounds only the connect phase; cap the whole
     transfer as well so a stalled response cannot hang the script. */
  curl_setopt($ch, CURLOPT_TIMEOUT, 30);
  $contents = curl_exec($ch);
  curl_close($ch);
  return $contents;
}
?>

 

<?php

/*
 * Scrapes the Baidu result page of each keyword, pulls the display URL out of
 * every promoted (ad) entry and appends it to semallurl.txt, echoing the same
 * data as HTML. Page download is delegated to fetch(), declared in this file.
 */
$fp = fopen("semallurl.txt", "a+");
if ($fp === false) {
  /* Every fwrite() below needs a valid handle; without it nothing can be logged. */
  exit("Cannot open semallurl.txt for appending");
}
/* NOTE(review): the last keyword repeats the third one, so that page is scraped twice - confirm whether another city was intended. */
$kws1="上海酒店,北京酒店,广州酒店,天津酒店,广州酒店";
$kws=explode(",",$kws1);
foreach ($kws as $kw){
  $keywords=$kw;
  $enkeywords=urlencode($keywords);
  $pageURL="http://www.baidu.com/s?word=$enkeywords";
  $contents=fetch($pageURL); /* download the result page */
  if (!is_string($contents)) {
    /* fetch() returns curl_exec()'s result, which is false when the transfer fails. */
    $contents="";
  }
  /* Strip JavaScript. "s" lets "." cross newlines so multi-line scripts are removed too; "i" also catches <SCRIPT>. */
  $stripped=preg_replace('/<script[^>]*?>.*?<\/script>/is', "", $contents);
  if (is_string($stripped)) {
    /* preg_replace() yields null on a PCRE error; keep the unstripped page in that case. */
    $contents=$stripped;
  }
  /* The page is cut at each '<div id="content_' marker; the first piece after a marker is
     treated as the right-hand column and the second as the left-hand column. A page
     without those markers simply produces columns with no ads. */
  $contentsbytwoside=explode('<div id="content_',$contents);
  $contents_right='<div id="content_'.(isset($contentsbytwoside[1]) ? $contentsbytwoside[1] : "");
  $contents_left='<div id="content_'.(isset($contentsbytwoside[2]) ? $contentsbytwoside[2] : "");
  preg_match_all('/(<div id=\"[0-9]*\" class=\"ec_pp_f ec_pp_top.*?)<a href=\"http:\/\/e\.baidu\.com\//',$contents_left,$ads_left_white);
  preg_match_all('/(<table class=\"EC_mr15 EC_ppim_top ec_pp_f.*?<\/table>)/',$contents_left,$ads_left_green);
  preg_match_all('(<div id=\"bdfs[^>]*class=\"EC_im EC_fr EC_PP  EC_idea1017 \">.*?<a class=\"EC_BL EC_desc\".*?<\/a>)',$contents_right,$ads_right);
  echo "------------Keywords ads for".$kw."start ------------------------------------<br>" ;
  fwrite($fp,  "----------".$kw . " ads start------------------------- \r\n");

  /* One row per ad group: on-screen heading, preg_match_all() result, pattern that isolates the display URL. */
  $sections=array(
    array("left ads with green background is<br>", $ads_left_green, '/<span>.*?<\/span>/'),
    array("left ads with white background is<br>", $ads_left_white, '/<span class=\"ec_url\">.*?<\/span>/'),
    array("right ads with is<br>", $ads_right, '/(<font size=\"-1\" class=\"EC_url\">.*?<\/font>)/'),
  );
  foreach ($sections as $index => $section) {
    if ($index > 0) {
      echo "<p>-------------<br>" ;
    }
    echo $section[0];
    /* Index 0 holds the full matches; it is absent if preg_match_all() failed. */
    $adBlocks=isset($section[1][0]) ? $section[1][0] : array();
    foreach ($adBlocks as $adHtml) {
      if (preg_match($section[2], $adHtml, $urlMatch) !== 1) {
        /* Ad block without a recognisable URL element: skip it rather than log an empty line. */
        continue;
      }
      fwrite($fp, strip_tags($urlMatch[0]) . "\r\n");
      echo $urlMatch[0]."<br>";
    }
  }

  echo "---------------Keywords ads for".$kw."END ------------------------------------<br>" ;
  fwrite($fp, "----------".$kw . " ads End------------------------- \r\n");
}

/* Timestamp marking the end of this run. */
fwrite($fp, date("Y-m-d H:i:s") . " PHP代码自动运行!\r\n");

fclose($fp);

/*
 * This file already declares fetch() unconditionally further up; declaring it
 * a second time is a fatal "Cannot redeclare" error, so this copy is only
 * defined when no fetch() exists yet.
 * NOTE(review): a guarded declaration is not hoisted - if this half of the
 * file is ever used on its own, move the function above the code that calls it.
 */
if (!function_exists('fetch')) {
  /**
   * Downloads a URL with cURL and returns the response body.
   *
   * @param string $Date URL to request (despite its name, this is not a date).
   * @return string|false Response body, or false when the cURL handle cannot be
   *                      created or the transfer fails.
   */
  function fetch($Date){
    $ch = curl_init();
    if ($ch === false) {
      return false;
    }
    $timeout = 5;
    curl_setopt($ch, CURLOPT_URL, "$Date");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    /* CURLOPT_CONNECTTIMEOUT bounds only the connect phase; cap the whole
       transfer as well so a stalled response cannot hang the script. */
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $contents = curl_exec($ch);
    curl_close($ch);
    return $contents;
  }
}
?>